Note: project github page: https://github.com/mlagisz/survey_best_paper_awards
Load Scimago Subject Area -level data set and associated meta-data table.
# accessing all the sheets of the screening workbook
#sheet_names <- excel_sheets(here("data", "scimagojr 2021 Subject Areas.xlsx"))
sheet_names <- excel_sheets(here("data", "scimagojr 2021 Subject Areas_screening.xlsx"))
sheet_names <- sheet_names[-1]#remove first sheet with meta-data
#NOTE(review): sheet_names come from the "_screening" workbook, but the sheets were
#previously read from "scimagojr 2021 Subject Areas.xlsx". Read from the same
#screening workbook so the sheet names are guaranteed to match; the downstream data
#contains screening-only columns (e.g. Reviewer_name), confirming the screening copy
#is the one actually used.
SA_list_all <- lapply(sheet_names, function(x) {as.data.frame(read_excel(here("data", "scimagojr 2021 Subject Areas_screening.xlsx"), sheet = x))}) #read all sheets to list
names(SA_list_all) <- sheet_names #rename list elements
#lapply(SA_list_all, names) #extract column names
#View(do.call(rbind, lapply(SA_list_all, names))) #View all column names as a table
#unique(do.call(rbind, lapply(SA_list_all, names))) #making sure all have same column names
SAdata <- do.call(rbind, SA_list_all) #stack all Subject Area sheets into one data frame
#str(SAdata) #one large data frame
#names(SAdata)
names(SAdata) <- gsub(" ","_", names(SAdata)) #change spaces to _ in the column names
names(SAdata) <- gsub("\\.","", names(SAdata)) #remove . from the column names
#dim(SAdata)
SAdata <- SAdata %>% drop_na(Subject_area) #drop rows with no Subject Area recorded
#table(is.na(SAdata$Title)) #no empty journal names
SAmeta <- read_excel(here("data", "scimagojr 2021 Subject Areas_screening.xlsx"), sheet = 1, skip = 1) #load Scimago SA meta-data
#dim(SAmeta)
#names(SAmeta)
Load award-level data set and associated meta-data table.
#BP load and prepare award-level meta-data
BPmeta <- read_excel(here("data", "Survey-Best_paper_awards (Responses)_SHAREDCOPY_checked.xlsx"), sheet = 2, skip = 1) #load main award meta-data (sheet 2; first header row skipped)
#dim(BPmeta)
#names(BPmeta)
## load and prepare main extracted award-level data
BPdata <- read_excel(here("data","Survey-Best_paper_awards (Responses)_SHAREDCOPY_checked.xlsx"), sheet = 1) #load main award data
#BPdata <- read_excel("./data/Survey-Best_paper_awards (Responses)_SHAREDCOPY_checked.xlsx", sheet = 1) #load main award data
#dim(BPdata)
#names(BPdata)
#BPdata <- select(BPdata, !starts_with("...") ) #remove last few empty columns
#BPdata_orig_colnames <- names(BPdata) #store original column names
#BPdata$Award_name <- gsub("^The ", "", BPdata$Award_name) #remove "The" at the beginning of award names
names(BPdata) <- gsub(" ","_", names(BPdata)) #change spaces to _ in the column names
#rename selected data columns to shorter analysis-friendly names
BPdata <- BPdata %>%
rename(Extractor_name = "Name_of_the_extracting_person",
Award_name = "Full_name_of_the_award",
#NOTE(review): "Full_name_of_the_awarding_journal" is mapped to TWO new names
#(Journal_name here and Awarding_journal below). dplyr::rename() errors when the
#same column is renamed twice, so this presumably relies on readxl having
#de-duplicated two identically-named workbook columns -- TODO confirm against
#the actual spreadsheet headers.
Journal_name = "Full_name_of_the_awarding_journal",
Award_description = "Paste_the_information_text_describing_the_award",
Eligible = "Confirm_eligibility_of_the_award",
Awarding_journal = "Full_name_of_the_awarding_journal",
Awarding_society = "Full_name_of_the_awarding_society",
Awarding_other = "Full_name_of_the_awarding_publisher/other_body",
Career_stage = "Target_career_stage_of_eligible_applicants",
Flexible_eligibility = "Flexibility_of_the_eligibility_criteria",
Assessors_transparency = "Assessor_transparency",
Open_science = "Valuing_Open_Science",
Self_nomination = "Self-nomination_allowed"
)
#names(BPdata)
##check for rows with empty "Scimago Subject Area" values
table(is.na(BPdata$Scimago_Subject_Area)) #4 rows from pilot screening have no SA recorded
## FALSE TRUE
## 264 4
##remove rows with pilot extractions and empty "Scimago Subject Area" values
BPdata <- BPdata[!is.na(BPdata$Scimago_Subject_Area), ]
#remove awards that were duplicate-extracted and excluded at extraction stages (Row_excluded: 1 = excluded, 0 = kept)
BPdata <- BPdata[BPdata$Row_excluded == 0, ]
dim(BPdata) #NOTE(review): original comment claimed 223 rows left but the recorded output was 222 ## [1] 222 54
##NOTE(review): original note gave inconsistent removal counts ("41 rows removed" vs "18 awards doubly- or triply- extracted, 18 out of 223 = 8%") -- verify against the raw extraction data
#create new variable flagging awards with/without a description;
#uses the Open_science coding as a proxy: awards lacking any description document
#were coded "not available" across fields -- TODO confirm this proxy is intended
BPdata$Descr_available <- fct_collapse(BPdata$Open_science,
available = c("no", "yes"),
"not available" = "not available"
)Load winners-level data set and associated meta-data table.
#load individual winners data (one row per award-year-winner record)
indiv_winners_path <- here("data", "BP_awards_lists_SHAREDCOPY - indiv_winners_20230717.csv")
BPindiv <- read_csv(indiv_winners_path, skip = 1)
## Rows: 1083 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (20): award_SA, award_name, award_link, individual, awardee_name, awarde...
## dbl (2): record_count, award_year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#dim(BPindiv)
#names(BPindiv)
#load the matching meta-data table describing the winners columns
indiv_meta_path <- here("data", "BP_awards_lists_SHAREDCOPY - indiv_winners_meta-data.csv")
BPindiv_meta <- read_csv(indiv_meta_path, skip = 1)
## Rows: 22 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): column name, description, data type [options]
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#dim(BPindiv_meta)
#names(BPindiv_meta)
Load country-level productivity SCImago data set and create associated meta-data table.
#load SCImago 2021 country productivity (documents) data downloaded from https://www.scimagojr.com/countryrank.php?year=2021&min=0&min_type=it
COprod <- read_csv(here("data", "scimagojr country rank 2021.csv"), skip = 0) #load data## Rows: 235 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country, Region
## dbl (7): Rank, Documents, Citable documents, Citations, Self-citations, Cita...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#dim(COprod)
#names(COprod)
#create meta-data table (one row per COprod column): column name, description, data type
#descriptions are listed in the same order as names(COprod)
#(typo fix: "goven" -> "given" in the description strings)
COprod_meta <- tibble("column name" = names(COprod),
                      "description" = c("Rank of a given country across all Scimago Subject Areas.",
                                        "Country name.",
                                        "Country location classification into one of the eight SCImago Major World Regions.",
                                        "Number of documents associated with a given country published in 2021.",
                                        "Number of citable documents associated with a given country published in 2021.",
                                        "Number of citations to documents associated with a given country published in 2021.",
                                        "Number of self-citations to documents associated with a given country published in 2021.",
                                        "Number of citations per document associated with a given country published in 2021.",
                                        "H index of a country in 2021." ),
                      "data type [options]" = c("integer",
                                                "free text",
                                                "free text",
                                                "integer",
                                                "integer",
                                                "integer",
                                                "integer",
                                                "numeric",
                                                "integer"
                                                ))
#dim(COprod_meta)
#names(COprod_meta)
{#id .class width = 50% height = 50%} Figure S1.
Figure S1. Project workflow.
These criteria were used to increase data consistency and improve comparisons across disciplines: We will only include awards for a single published research contribution in a format of a research article (theses / dissertations are allowed if published as a journal article). Awards must be managed by a journal, publisher, or a learned society. We will exclude awards that are discontinued, and awards that are restricted to applicants from underrepresented groups, e.g., women-only / minorities-only awards.
If some of these awards get excluded during data extraction, we will return to the screening stage for that subject area to find replacement awards until we have 10 awards that are eligible for inclusion, or we run out of journals to screen.
· During screening, we will see if journals note “best paper” awards on their websites, and if not, we will check if journals are run by learned societies. If so, we will check if a learned society awards a “best paper” prize that fits in a given subject area being assessed. If a society awards multiple relevant awards in journals in the same subject area, we will choose one for a highest-ranked journal, or that appears to be most prestigious (e.g., with highest monetary value, oldest). · If less than 10 awards are found for a subject area, we will also check websites of large and reputable publishers to see if they list any relevant “best paper” awards in a given subject area. · For each subject area, we will collate a maximum of 10 awards that meet our selection criteria for data extraction.
##count journals per SA
# count_j_perSA <- SAdata %>%
#   count(Subject_area) %>%
#   arrange(desc(n))
#table(SAdata$Subject_area)
#min(table(SAdata$Subject_area))
#max(table(SAdata$Subject_area))
#all SA as a simple barplot of journal counts; reorder() with desc() puts the
#Subject Areas in alphabetical order from the top after coord_flip()
SAdata %>%
  #filter(!is.na(Subject_area)) %>%
  count(Subject_area) %>%
  arrange(Subject_area) %>%
  ggplot(aes(x = reorder(Subject_area, desc(Subject_area)), y = n)) +
  geom_col(position = position_dodge(0.9)) + #geom_col() is the idiomatic form of geom_bar(stat = "identity")
  coord_flip()
#### Figure S2. Counts of journal Titles per SCImago ranking lists by Subject Area.
The numbers of journals per SA vary widely reflecting the differences in sizes of different research disciplines. Random sampling of journals would result in uneven representation of disciplines, biasing the conclusions toward being representative of the largest disciplines, potentially with the largest number of relevant awards.
#names(SAdata)
# length(unique(SAdata$Title)) #number of unique journals
# length(unique(SAdata$Title))/length(SAdata$Title) #60% unique
#reduce to the top 100 (best-ranked) journals from each SA
#NOTE(review): the original used top_n(100, Title), which selects the 100
#alphabetically-LAST journal names within each SA, not the 100 best-ranked
#journals that the comments and text describe; select by Rank instead
#(Rank 1 = best journal in the SA).
SAdata %>%
  #filter(!is.na(Subject_area)) %>%
  group_by(Subject_area) %>%
  slice_min(Rank, n = 100) %>%
  ungroup() -> SAdata100
#dim(SAdata100)
#count number of duplicated journal titles across all SA top 100 journal titles:
#sum(duplicated(SAdata100$Title)) #644 duplicated journal Titles (as originally reported; recheck after the Rank-based selection fix)
#sum(duplicated(SAdata100$Title))/length(SAdata100$Title) #24% duplicated journal Titles (out of 2700)
#display all duplicate rows in the data frame: journals that appear (with a
#reviewer assigned) in more than one SA top-100 list
SAdata100 %>%
  filter(!is.na(Reviewer_name)) %>%
  group_by(Title) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  arrange(Title) #%>% View## # A tibble: 30 × 35
## Subject_area Rank Sourceid Title Reviewer_name Link_journal Society_name
## <chr> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 Immunology and … 28 1.30e 4 PLoS… Upama Aich https://jou… <NA>
## 2 Neuroscience 18 1.30e 4 PLoS… Joshua Wang https://jou… <NA>
## 3 Chemical Engine… 8 2.75e 4 Prog… Ju Zhang https://www… <NA>
## 4 Energy 9 2.75e 4 Prog… Andrew Pua https://www… The Combust…
## 5 Chemical Engine… 88 1.95e 4 Shiy… Ju Zhang http://www.… <NA>
## 6 Energy 95 1.95e 4 Shiy… Andrew Pua http://www.… China Assoc…
## 7 Energy 43 2.11e10 Sola… Andrew Pua https://onl… Unknown
## 8 Physics and Ast… 67 2.11e10 Sola… Nina Trubano… https://onl… <NA>
## 9 Earth and Plane… 29 2.83e 4 Spac… Joanna https://www… <NA>
## 10 Physics and Ast… 49 2.83e 4 Spac… Nina Trubano… https://www… <NA>
## # ℹ 20 more rows
## # ℹ 28 more variables: Has_relevant_award <chr>, Link_award <chr>,
## # Notes_award <chr>, Checker_name <chr>, Check_pass <lgl>, Check_notes <chr>,
## # Award_name_extract <chr>, Extract <lgl>, Award_extractor_name <chr>,
## # Extracted <lgl>, Comments <chr>, Archived <lgl>, Issn <chr>, SJR <dbl>,
## # SJR_Best_Quartile <chr>, H_index <dbl>, `Total_Docs_(2021)` <dbl>,
## # `Total_Docs_(3years)` <dbl>, Total_Refs <dbl>, …
Across top 100 journals in each SA ranking list there are 644 duplicated journal titles (23.9%). This indicates that one journal can often be classified into one or more related Subject Areas (e.g. “Trends in Cognitive Sciences” is included both in Neuroscience and Psychology SA). This overlap implies that the related awards will also often be relevant to more than one SA. Thus, analysing awards
Meta-data for the SCImago journal rankings by Subject Areas dataset.
# making a scrollable table
# Render the SA screening meta-data (SAmeta) as a striped HTML table
# inside a fixed-height scrollable box (kableExtra).
kable(SAmeta, "html") %>%
kable_styling("striped", position = "left") %>%
scroll_box(width = "100%", height = "500px")| column name | description | data type [options] |
|---|---|---|
| Subject_area | Name of the Scimago Subject Area (SA) representing 27 major thematic categories. | categorical [one of 27 Subject Areas] |
| Rank | Rank of a given Journal within a given Scimago Subject Area. | integer |
| Sourceid | Unique ID for the journal. | integer |
| Title | Name of the journal. | free text |
| Reviewer_name | Name of a person who reviews the journal trying to find associated eligible awards. | free text |
| Link_journal | Web link to the journal page used during screening. | web link |
| Society_name | Name of the learned society associated with the journal, if any. | free text |
| Has_relevant_award | Whether the journal (or its associated society) has a relevant best paper award. | categorical [yes; no] |
| Link_award | Main source of information for assigning gender. Order of preference: pronouns, photo, name. | web link |
| Notes_award | Affiliation institution (usually university) assigned to a winner when the award was received using information available on award websites, personal or institutional websites, or information in the awarded paper. | free text |
| Checker_name | Name of the person who cross-checked a given row of screening data. | free text |
| Check_pass | Main source of the winner’s affiliation information (institution, country) for the year when the award was received or announced. | categorical [yes; no] |
| Check_notes | Any comments for awardee, e.g. award sub-category, or link to additional info from Internet searches. | free text |
| Award_name_extract | Whether the award is shared with other authors of the same article. | categorical [yes; no] |
| Extract | Whether a bio/profile of the winner is shown on the award page/announcement (with more extended information than affiliation information only). | categorical [yes; no] |
| Award_extractor_name | Name of the person who extracted award data. | free text |
| Extracted | Whether a given award has been extracted. | categorical [yes; no] |
| Comments | Any comments regarding screened and extracted data in a given row. | free text |
| Archived | DOI of the winning paper. DOI code in a format starting with “10.”. | DOI |
| Issn | ISSN number of the journal. | free text |
| SJR | SCImago Journal Rank indicator expresses the average number of weighted citations received in the selected year by the documents published in the selected journal in the three previous years, i.e. weighted citations received in year X to documents published in the journal in years X-1, X-2 and X-3 | free text |
| SJR Best Quartile | Anny comments regarding extracted data in a given row. | categorical [Q1; Q2; Q3; Q4] |
| H index | The h index expresses the journal’s number of articles (h) that have received at least h citations. It quantifies both journal scientific productivity and scientific impact and it is also applicable to scientists, countries, etc. | number |
| Total Docs. (2021) | Output of the selected period. All types of documents are considered, including citable and non citable documents. | integer |
| Total Docs. (3years) | Published documents in the three previous years (selected year documents are excluded), i.e.when the year X is selected, then X-1, X-2 and X-3 published documents are retrieved. All types of documents are considered, including citable and non citable documents. | integer |
| Total Refs. | Total number of all the bibliographical references in a journal in the selected period. | integer |
| Total Cites (3years) | Number of citations received in the seleted year by a journal to the documents published in the three previous years, –i.e. citations received in year X to documents published in years X-1, X-2 and X-3. All types of documents are considered. | integer |
| Citable Docs. (3years) | Number of citable documents published by a journal in the three previous years (selected year documents are excluded). Exclusively articles, reviews and conference papers are considered. | integer |
| Cites / Doc. (2years) | Average citations per document in a 2 year period. It is computed considering the number of citations received by a journal in the current year to the documents published in the two previous years, –i.e. citations received in year X to documents published in years X-1 and X-2. | number |
| Ref. / Doc. | References per Document is the average number of references per document in the selected year. | number |
| Country | Country where journal is indexed by Scopus. | categorical |
| Region | Eight Major World Regions in used to facilitate sectorial analyses. | categorical |
| Publisher | Name of the journal publisher. | free text |
| Coverage | Year range for which publication data is available on Scopus (?). | free text |
| Categories | Tags for relevant 309 specific subject categories according to Scopus® Classification. | categorical |
Meta-data for the Best Paper awards dataset.
# making a scrollable table
# Render the Best Paper award meta-data (BPmeta) as a striped HTML table
# inside a fixed-height scrollable box (kableExtra).
kable(BPmeta, "html") %>%
kable_styling("striped", position = "left") %>%
scroll_box(width = "100%", height = "500px")| column name | description | data type [options] |
|---|---|---|
| Timestamp | System-recorded data and time of when the given row of data was submitted via the Google Form. | time |
| Name of the extracting person | Name of the extracting person: Please use your name in one consistent form (should be matching the name you gave us on EOI and recorded on GoogleSheet for award screening). It is best to always use your first and last name, to avoid confusion. | free text |
| Scimago Subject Area | Scimago Subject Area: Record name of the Scimago Subject Area (SA) you are doing extractions for - as in the Subject_area column in the GoogleSheet with journal and awards lists. Should be exactly as used in the original list of Subject Areas. | categorical [one of 27 Subject Areas] |
| Full name of the award | Full name of the award: Record specific award name, as in the Award_name_extract column in the GoogleSheet with journal and awards lists. | free text |
| Source of the information on the award | Source of the information on the award: Ideally, enter a link to an online page/document with information on award eligibility and assessment criteria. If not available, could be also a link to any document describing the award. If you cannot find any information about the award enter “NA”. You can paste more than one link separated by “;”. | weblink |
| Paste the information text describing the award | Paste the information text describing the award: Ideally, copy and paste the section of the online page/document with information on award eligibility and assessment criteria only. This description text will be used for text-mining analyses. Do not copy and paste list of past winners or generic information about the journal or society. Do not copy and paste any images. If you cannot find any information about the award enter “NA”. | free text |
| Confirm eligibility of the award | Confirm eligibility of the award: · This award is for a single published research contribution in a format of a research article (theses / dissertation are allowed if published as a journal article). · This award is managed by a journal, publisher, or a learned society. · This award is NOT discontinued or restricted to applicants from underrepresented groups, e.g., women-only / minorities-only. Options: yes (eligible award), no (not eligible - make note on the reasons below and enter NA to the following compulsory questions, then submit), unclear (make note on the reasons below) | categorical [yes; no; unclear] |
| Comment on the award eligibility | Comment on the award eligibility: Any comments, e.g. reasons for excluding this award. | free text |
| Full name of the awarding journal | Full name of the awarding journal: Record if award is associated with a specific journal. Enter “NA” if not associated with a specific journal. | free text |
| Full name of the awarding society | Full name of the awarding society: Record if award is associated with a specific society. Enter “NA” if not associated with a specific society. | free text |
| Full name of the awarding publisher/other body | Full name of the awarding publisher/other body: Record if award is associated with a specific publisher (group of journals) or awarding organisation other than learned society or journal. Enter “NA” if not associated with a specific publisher. | free text |
| Comment on the awarding body | Comment on the awarding body: Any comments, e.g. if more than one journal considered for the award. | free text |
| Award cash | Award cash: Code whether there is any cash coming with the award. - Select “yes” if award description mentions any cash given to the winners. - Select “no” if award description does not mention any cash given to the winners. Travel awards, publishing discounts, plenary talks, etc. do not count as cash awards. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Inclusivity phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. | free text |
| Comment on the award cash | Comment on the award cash: Add comments on cash amount, if provided, and note any other benefits winners receive, as relevant (e.g., travel awards, publishing discounts, plenary talks) | free text |
| Target career stage of eligible applicants | Target career stage of eligible applicants, as stated in the award information: As stated in the award information. More than one choice is possible. If it is for any paper in a journal/s, select “any career stage”. Options: student, early-career, mid-career, any career stage, unclear. | categorical [student; early-career; mid-career; any career stage; unclear] |
| Comment on the career stage | Comment on the career stage: Any comments, e.g. paste the phrasing of who is eligible for the award. | free text |
| Flexibility of the eligibility criteria | Flexibility of the eligibility criteria: Code whether explicitly allowing for career interruptions in eligibility timeframes: - Select “not applicable” if description only states that published research has to be performed while studying for a degree OR if there is no eligibility limit in terms of years since an event (e.g. a PhD or author’s age. - Select “yes” if the description says that there is an eligibility limit of years after a degree to apply for the. award and this limit can be extended in special cases”. - Select “no” if the description says that there is an eligibility limit of years after a degree to apply for the award but it does not mention that this limit can be extended in special cases. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Eligibility phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not applicable, not available. | categorical [yes; no; not applicable; not available] |
| Eligibility phrasing | Eligibility phrasing: Wording of the eligibility criteria in relation to career stage in the relevant documentation, if available. | free text |
| Inclusivity statement | Inclusivity statement: Code whether underrepresented/minority groups are encouraged to apply for the award (this does not mean that the award is restricted to underrepresented groups, e.g., women-only) or award information includes declarations of commitment to equity / diversity / inclusivity (EDI): - Select “yes” if award description mentions underrepresented/minority groups or EDI. - Select “no” if award description does not mention anything about underrepresented/minority groups or EDI. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: Extract this information from award descriptions only - do not include information from other documents not directly related to the award, e.g. journal/society/institutional policies or mission statements. In the next question (“Inclusivity phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. | categorical [yes; no; not available] |
| Inclusivity phrasing | Inclusivity phrasing: Wording of the inclusivity statement in the relevant documentation, if available. | free text |
| Assessor transparency | Assessor transparency: Code whether information is provided on who will be conducting assessments of candidate papers (for example, that editors-in-chief will be deciding): - Select “yes” if information is provided, e.g. names, or stating that journal editors will be doing this, or mentioning specific existing committee that is described somewhere else (e.g. on a society webpage). - Select “no” if description does not mention any information on the assessors. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). In the next question (“Assessor phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. | categorical [yes; no; not available] |
| Assessor phrasing | Assessor phrasing: Wording of the information on who will be conducting the assessments, if available. | free text |
| Process transparency | Process transparency: Code whether breakdown of the applicants / candidates by gender or geographic region is publicly available: - Select “yes” if such information is provided (e.g. on a society webpage). - Select “no” if such information is not provided. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Process phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. | categorical [yes; no; not available] |
| Process phrasing | Process phrasing: Wording of the information on the transparency of the process, e.g. if breakdown of the applicants / candidates by gender or geographic region is publicly available. | free text |
| Feedback availability | Feedback availability: Code whether award information included an offer of constructive feedback for unsuccessful applicants: - Select “yes” if such feedback can be provided (e.g. on request). - Select “no” if such feedback is not provided or not mentioned. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Feedback phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. | categorical [yes; no; not available] |
| Feedback phrasing | Feedback phrasing: Wording of the information on whether/how feedback will be provided, if available. | free text |
| Criteria transparency | Criteria transparency: Code whether assessment criteria are detailed (more than one vague sentence) or vague (often one vague sentence, e.g. “assessed on innovation and novelty”): - Select “yes” if assessment criteria are more than one vague sentence (e.g., mentions things like readability of the language, sample size, performing alternative tests/analyses to check robustness of the results, sharing data or code, registering the study/protocol - whatever is relevant to a given research field). - Select “no” if assessment criteria are only one vague sentence (e.g. award is for “best paper”, “outstanding contribution”, “excellent research”, “novel insights”, etc.). - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Criteria phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. | categorical [yes; no; not available] |
| Criteria phrasing | Criteria phrasing: Wording of the information on the assessment criteria, if available. | free text |
| Valuing Open Science | Valuing Open Science: Code whether any Open Science practices (data, code, materials sharing, preregistration, transparency of reporting, etc.) are explicitly included in the assessment criteria: - Select “yes” if award description mentions valuing any of the Open Science practices (e.g., sharing data or code, registering the study/protocol, performing replication studies - whatever is relevant to a given research field). - Select “no” if the award description does not mention valuing any of the Open Science practices. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Valuing Open Science phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. | categorical [yes; no; not available] |
| Valuing Open Science phrasing | Valuing Open Science phrasing: Wording of the information on the assessment criteria valuing Open Science practices, if available. | free text |
| Self-nomination allowed | Self-nomination allowed: Code whether candidates can self-nominate for the award. - Select “yes” if the award description explicitly mentions that candidates can self-nominate. - Select “no” if the award description states that candidates/papers need to be nominated by somebody else or does not explicitly mention that candidates can self-nominate. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Self-nomination phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available, | categorical [yes; no; not available] |
| Self-nomination phrasing | Self-nomination phrasing: Wording of the information on how to nominate (e.g., using a checkbox on the manuscript submission form), if available. | free text |
| Letter required | Letter required: Code whether candidates are required to provide nomination/recommendation letter/s: - Select “yes” if award description explicitly mentions that candidates have to provide nomination or support letters from others. - Select “no” if the award description states that candidates/papers do not need to provide letters from somebody else or does not explicitly mention anything about such letters. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Letter requirement phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. | categorical [yes; no; not available] |
| Letter requirement phrasing | Letter requirement phrasing: Wording of the information on the requirement for written nominations / reference letters, if available. | free text |
| Awardee list source | Awardee list source: Source of the information on the past winners - paste a link to a webpage, file name, personal information, etc., showing a list of past winners (any year span). Write “not available” if no such list exists. | weblink |
| Awardee list most recent year | Awardee list most recent year: The most recent year for which information on past awardees is available (enter whole number, e.g. 2022). Leave empty if no such list exists. | integer |
| Awardee list earliest year | Awardee list earliest year: The earliest year for which information on past awardees is available (enter whole number, e.g. 1999). Leave empty if no such list exists. | integer |
| Comments on awardees list | Comments on awardees list: Any comments for awardees list, e.g. if Internet searches might be needed to locate announcements with the names of the past winners. | free text |
| Comments_general | Comments_general: Any other notes and comments on issues, assumptions, or seeking additional information from the award committees / contacts. | free text |
| Checked | Whether a given row of extracted data has been cross-checked. | categorical [yes; no] |
| Checker_name | Name of the person who cross-checked a given row of extracted data. | free text |
| Checker_comments | Any comments regarding extracted data in a given row. | free text |
| Row_excluded | Whether a given row of extracted data has been excluded from the analyses. | integer [1 = yes; 0 = no] |
| Award_individual | Whether a given award recognises individual (selected) authors rather than all authors of a winning paper. | categorical [yes; no] |
| Award_impact_metrics_mentioned | Whether award criteria mention impact metrics (number of citations, downloads) as basis of winner selection. | categorical [yes; no] |
| Award_impact_metrics_only | Whether impact metrics (number of citations, downloads) are the sole basis of winner selection. | categorical [yes; no] |
| Award_impact_metrics_comment | Quote relevant sections of award description if impact metrics are mentioned. Add any comments regarding award criteria and impact metrics. | free text |
| Award_contact_provided | Whether specific contact information (email) is provided on the award webpage or publicly available application/description documents. | categorical [yes; no] |
| Award_integrity_mentioned | Whether award description states how potential conflicts of interest will be managed. | categorical [yes; no; not available] |
| Award_integrity_comment | Quote relevant sections of award description on how potential conflicts of interest will be managed. | free text |
| Award_cash_max_USD_pperson | The maximum reported amount of cash awarded per winning paper, recalculated online into USD. | Integer |
| N_winners_extracted | Number of individual winners from years 2001-2022 extracted for this award (in indiv_winners data set). | Integer |
| Archived_files | Whether award website and other information have been archived as pdf files in the dedicated project folder. | categorical [yes; no] |
| Comment_extraction | Any comments regarding extracted data in a given row. | free text |
Meta-data for the individual winners dataset.
# render the individual-winners meta-data table (BPindiv_meta) as a scrollable HTML table
kable(BPindiv_meta, "html") %>%
kable_styling("striped", position = "left") %>%
scroll_box(width = "100%", height = "500px")| column name | description | data type [options] |
|---|---|---|
| award_SA | Name of the Scimago Subject Area (SA) you are doing extractions for - as in the Subject_area column in the GoogleSheet with journal and awards lists. Should be exactly as used in the original list of Subject Areas. | categorical [one of 27 Subject Areas] |
| record_count | Subsequent numbers for counting extracted winners within each award. | integer |
| award_name | Name of the award - as in the Award_name column in the main GoogleSheet with awards main data extraction. | free text |
| award_link | Weblink to the main Internet page with award description / information. | weblink |
| individual | Whether an award is individual or article-focused. Individual awards are defined as awards that are usually not shared equally between all article authors. For example the award names as a winner only the ECR authors or only the first author (unless there are more than one ECR/first authors who contributed equally). | categorical [yes; no] |
| award_year | Year when award was won/announced for a given individual winner, as reported on the award page or other documents with winner information. | integer |
| awardee_name | Name of a person who won the award, as reported on the award page or other documents with winner information. | free text |
| awardee_gender | Gender assigned to an individual past awardee using information (pronouns, images, names) available on award websites, personal or institutional websites, or from first names. | categorical [F = female; M = male; ? - unassigned] |
| gender_source | Main source of information for assigning gender. Order of preference: pronouns, photo, name. | categorical [pronouns; photo; name] |
| affiliation_institution | Affiliation institution (usually university) assigned to a winner when the award was received using information available on award websites, personal or institutional websites, or information in the awarded paper. | free text |
| affiliation_country | Affiliation country assigned to a past winner for the year when the award was received using information available on award websites, personal or institutional websites, or information in the awarded paper. | free text |
| affiliation_info_source | Main source of the winner’s affiliation information (institution, country) for the year when the award was received or announced. | categorical [award page; award announcement; article; other] |
| awardee_comment1 | Any comments for awardee, e.g. award sub-category, or link to additional info from Internet searches. | free text |
| shared | Whether the award is shared with other authors of the same article. | categorical [yes; no] |
| awardee_profile_shown | Whether a bio/profile of the winner is shown on the award page/announcement (with more extended information than affiliation information only). | categorical [yes; no] |
| awardee_photo_shown | Whether a photo of the winner is shown on the award page/announcement. | categorical [yes; no] |
| awardee_comment2 | Any other comments on the awardee or awarded paper, e.g. award sub-category, or link to additional info from Internet searches. | free text |
| awarded_paper_ref | Bibliographic reference of the winning paper. | free text [reference in any long format] |
| awarded_paper_doi | DOI of the winning paper. DOI code in a format starting with “10.”. | DOI |
| checked | Whether a given row of extracted data has been cross-checked. | categorical [yes; no] |
| checker_name | Name of the person who cross-checked a given row of extracted data. | free text |
| checker_comment | Any comments regarding extracted data in a given row. | free text |
Overall: Total number of unique names of awards:
222.
Total number of Scimago Subject Areas covered: 27.
Total number of unique names of awarding journals: 177.
Total number of unique names of awarding societies: 127.
Total number of unique names of awarding other bodies: 22.
Total number of awards without award description: 11 (4.9%)
By awarding or funding body: Total number of awards associated with a journal or a few related journals: 176 (78.9 %) Total number of awards associated with a learned society: 144 (64.6 %) Total number of awards associated with other organisation (e.g. a publisher, university, charity): 39 (17.5 %). Most commonly mentioned other organisations: 15 mention “Elsevier”, 6 mention “Wiley”).
By award starting date Award year of the earliest listed winner used as a proxy: 1923. Number of awards with the earliest listed winner within 2011-2023: 107 (48 %).
By target career stage: Total number of awards for
‘any career stage’ authors: 179 (80.3%).
Total number of awards for ‘early career’ authors: 31 (13.9%). Total
number of awards for ‘student’ authors: 27 (12.1%). Total number of
awards for ‘early career’ or ‘student’ authors: 20 (9%).
Total number of awards for ’mid-career’ authors: 5 (2.2%).
By focus on individuals (“individual award”) or whole article
(“team award”): Total number of awards for individual authors:
66 (29.6%).
Total number of awards for individual authors with any career stage
being eligible: 28 (42.4%).
Total number of awards for individual authors with inflexible time
limits for eligibility:21.
Total number of awards for individual authors with flexible time limits
for eligibility: 5.
Summarizing other award characteristics: Total
number of awards with inclusivity statement: 2 ( 0.9%).
Total number of awards with assessor transparency: 114 ( 51.1%).
Total number of awards with assessment integrity mentioned: 19 (
8.5%).
Total number of awards with process transparency: 2 ( 0.9%).
Total number of awards with feedback available: 0 ( 0%).
Total number of awards with contact details available: 38 ( 17%).
Total number of awards with self-nomination allowed: 28 ( 12.6%). Total
number of awards with required nomination letter: 38 ( 17%). Total
number of awards with non-vague criteria: 40 ( 19%). Total number of
awards with impact metrics mentioned in the criteria: 21 ( 10%). Total number of awards with
impact metrics as the sole criterion: 8 ( 3.8%). Total number of awards which value Open
Science: 1 ( 0.5%). Total number of awards with cash award: 112 (
50.2%).
Overall: Total number of unique names of individual winners records: 1083.
##Check award names - same award could have been extracted in different SA, or different awards have same names
# table(duplicated(BPdata$Award_name)) #0 duplicated, 223 unique
## Simple counts
# #eligible awards per SA
# count_awards_perSA <- BPdata %>%
# count(Scimago_Subject_Area) #%>%
# #arrange(desc(n))
#
# count_awards <- BPdata %>%
# # group_by(Scimago_Subject_Area) %>%
# count(Descr_available) %>%
# arrange(desc(n)) #description not available for 12 awards (5%)
# BPdata %>%
# group_by(Awarding_journal) %>%
# filter(n() > 1) %>%
# select(Awarding_journal) %>%
# filter(!is.na(Awarding_journal)) %>%
# count() #no journals with more than one award across different
# BPdata %>%
# group_by(Awarding_society) %>%
# filter(n() > 1) %>%
# select(Awarding_society) %>%
# filter(!is.na(Awarding_society)) %>%
# count() #some societies with more than one award (max.4)
# List awarding bodies other than journals/societies, with per-body counts.
# NOTE: missing values in Awarding_other are stored as the literal string "NA",
# so real NA removal via !is.na() is still needed for blank cells.
# FIX: dropped the no-op filter(n() > 0) (every group has at least one row)
# and the redundant select(); count(Awarding_other) tallies per body directly.
BPdata %>%
  filter(!is.na(Awarding_other)) %>%
  count(Awarding_other) #%>% View()#list of other awarding bodies mentioned with counts
# #count number of unique awards per year:
# BPindiv %>%
# #filter(!is.na(affiliation_country)) %>%
# group_by(award_year) %>%
# summarise(count = n_distinct(award_name))
##How many awards per Scimago_Subject_Area
# table(BPdata$Scimago_Subject_Area) #note many SA with <10 extracted awards - this is because of many journals being shared between SA
## Exploratory tabulations of award characteristics (read-only; no objects modified).
## NOTE(review): missing values in the Awarding_* columns are stored as the
## literal string "NA" (not real NA), so comparisons deliberately use == "NA".
table(BPdata$Awarding_journal == "NA") #46 not associated with a journal
table(BPdata$Awarding_society == "NA") #80 not associated with a learned society
table(BPdata$Career_stage, useNA = "always")
table(BPdata$Award_individual, useNA = "always") #62 yes
table(BPdata$Flexible_eligibility, useNA = "always") #only 5 - "yes"
#View(BPdata[BPdata$Flexible_eligibility == "yes", ]) #see rows with "yes"
#two-way table for the Career_stage and Flexible_eligibility
table(BPdata$Career_stage, BPdata$Flexible_eligibility, useNA = "always") #most "not applicable" is for "any career stage"
#two-way table for the Flexible_eligibility and Award_individual
table(BPdata$Flexible_eligibility, BPdata$Award_individual, useNA = "always") #flexible ones are for ECRs only
#two-way table for the Career_stage and Award_individual
table(BPdata$Career_stage, BPdata$Award_individual, useNA = "always") #individual are mostly for ECRs
#BPdata$Award_name[BPdata$Career_stage == "unclear"]
#BPdata$Award_name[BPdata$Career_stage == "early-career" & BPdata$Award_individual == "no"]
#BPdata$Award_name[BPdata$Career_stage == "early-career" & BPdata$Award_individual == "no"]
#length(BPdata$Award_name[BPdata$Career_stage == "any career stage" & BPdata$Award_individual == "no"]) #151
#three-way table
table(BPdata$Career_stage, BPdata$Flexible_eligibility, BPdata$Award_individual) #most "not applicable" is for "any career stage"
#length(BPdata$Award_name[BPdata$Career_stage == "any career stage" & BPdata$Award_individual == "no"]) #151
## Counts of awards by combinations of individual focus, career stage and
## eligibility flexibility (numbers in trailing comments are observed results):
length(BPdata$Award_name[BPdata$Award_individual == "no" & BPdata$Career_stage == "any career stage" & BPdata$Flexible_eligibility == "not applicable"]) #146 not applicable flexibility
length(BPdata$Award_name[BPdata$Award_individual == "no" & BPdata$Career_stage == "any career stage" & BPdata$Flexible_eligibility == "not available"]) #5 with no description
length(BPdata$Award_name[BPdata$Award_individual == "no" & BPdata$Career_stage == "any career stage" & BPdata$Flexible_eligibility == "yes"]) #0 with flexibility
length(BPdata$Award_name[BPdata$Award_individual == "yes" & BPdata$Career_stage == "any career stage"]) #29 not limited to specific career stage
length(BPdata$Award_name[BPdata$Award_individual == "yes" & BPdata$Flexible_eligibility == "not applicable"]) #39 not applicable flexibility
length(BPdata$Award_name[BPdata$Award_individual == "yes" & BPdata$Flexible_eligibility == "not available"]) #2 with no description
length(BPdata$Award_name[BPdata$Award_individual == "yes" & BPdata$Flexible_eligibility == "yes"]) #5 with flexibility
## One-way tabulations of the yes/no award-characteristic columns:
table(BPdata$Inclusivity_statement, useNA = "always") #only 2 "yes"
BPdata$Award_name[BPdata$Inclusivity_statement == "yes"] #see rows with "yes"
table(BPdata$Assessors_transparency, useNA = "always") #115 "yes"
#View(BPdata[BPdata$Assessors_transparency == "yes", ]) #see rows with "yes"
names(BPdata)
# sum(str_count(tolower(BPdata$Assessor_phrasing), "editor"), na.rm = TRUE) #163 - counting all mentions
# sum(str_detect(BPdata$Assessor_phrasing, "editor"), na.rm = TRUE) #37 - counting once per award
table(BPdata$Process_transparency, useNA = "always") #only 1 "yes"
#BPdata$Award_name[BPdata$Process_transparency == "yes"] #see award name with "yes"
table(BPdata$Self_nomination, useNA = "always") #28 yes
#View(BPdata[BPdata$Self_nomination == "yes", ]) #see rows with "yes"
table(BPdata$Letter_required, useNA = "always") #38 yes
#View(BPdata[BPdata$Letter_required == "yes", ]) #see rows with "yes"
table(BPdata$Letter_required, BPdata$Self_nomination, useNA = "always") #two-way table
dim(BPdata[BPdata$Letter_required == "yes" & BPdata$Self_nomination == "yes", ])[1] #mentioning letter and self-nomination
dim(BPdata[BPdata$Letter_required == "no" & BPdata$Self_nomination == "no", ])[1] #not mentioning letter and self-nomination
table(BPdata$Feedback_availability, useNA = "always") #0 "yes"
table(BPdata$Award_contact_provided, useNA = "always") #38 "yes"
table(BPdata$Criteria_transparency, useNA = "always") #40 yes
#View(BPdata[BPdata$Criteria_transparency == "yes", ]) #see rows with "yes"
table(BPdata$Award_impact_metrics_mentioned, useNA = "always") #21 "yes"
table(BPdata$Award_impact_metrics_only, useNA = "always") #8 "yes"
table(BPdata$Open_science, useNA = "always") #1 "yes"
View(BPdata[BPdata$Open_science == "yes", ]) #see rows with "yes": Social Sciences: Review Of Research Award by American Educational Research Association. Two journals are considered: Review of Educational Research. Review of Research in Education
## Coverage of the past-winner lists (year ranges):
table(BPdata$Awardee_list_most_recent_year, useNA = "always") #a few <2020 and 6 NA
#View(BPdata[is.na(BPdata$Awardee_list_most_recent_year), ]) #see rows with NA
table(BPdata$Awardee_list_earliest_year, useNA = "always") #some very old, 19 NA
#View(BPdata[is.na(BPdata$Awardee_list_earliest_year), ]) #see rows with NA
#hist(BPdata$Awardee_list_earliest_year)
length(BPdata$Awardee_list_earliest_year[BPdata$Awardee_list_earliest_year > 2010 & !is.na(BPdata$Awardee_list_earliest_year)] ) #107 from 2011-2023A. Screening effort - count numbers of journals screened to get 10 shortlisted awards
#load journal screening summary dataset:
BPscreening <- read_csv(here("data","BP_awards_lists_SHAREDCOPY - scimagojr_2021_SA.csv")) #load award screening summary data TODO## Rows: 29 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): Subject_Area, Reveiwer_name, Reviewer_email, Screening_status (ass...
## dbl (4): Nr, N_records_imported, N_records_screened, Extraction1_N_awards_e...
## lgl (2): Extraction1_comments, Extraction2_winners_checking_comments
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
BPscreening <- BPscreening[!is.na(BPscreening$Subject_Area), ] #remove 2 extra rows without Subject_Area
count_awards_perSA <- BPdata %>% count(Scimago_Subject_Area) #count included awards per SA
#FIX: align the per-SA award counts by Subject Area NAME via match(), instead of
#assuming both data frames share an identical row order (silent misalignment risk).
#TODO(review): confirm Subject_Area and Scimago_Subject_Area use identical spellings.
BPscreening$N_included <- count_awards_perSA$n[match(BPscreening$Subject_Area, count_awards_perSA$Scimago_Subject_Area)] #add included award counts per SA
BPscreening$N_excluded <- BPscreening$N_records_screened - BPscreening$N_included #journals screened but whose awards were not included
BPscreening$Subject_Area <- as.factor(BPscreening$Subject_Area) #convert to factor for plotting
## reshape into long dataframe format (one row per SA x status, with counts):
BPscreening %>% select(Subject_Area, N_included, N_excluded) %>% gather(key = status, count, N_included:N_excluded) -> BPscreening_long
#NOTE: an earlier factor()/as.character() round-trip on `status` was removed -
#it was a no-op because the factor level order was immediately discarded.
#FIX: restrict the low-count relabelling to N_included rows only, so an SA with
#few *excluded* journals can never be mislabelled as "N_included_low":
BPscreening_long$status[BPscreening_long$status == "N_included" & BPscreening_long$count <= 5] <- "N_included_low" #mark SA with 5 or fewer awards included
#Panel A: horizontal stacked bars of screened journals per Subject Area,
#split by screening outcome; dotted red line marks the 10-award target.
figure1A <- BPscreening_long %>%
mutate(status = recode(status, 'N_excluded' = 'not found', 'N_included_low' = 'found and included (<=5)', 'N_included' = 'found and included (>5 awards)')) %>%
ggplot(aes(x = reorder(Subject_Area, desc(Subject_Area)), y = count, fill = status)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,25,50,75,100)) +
theme_bw() +
scale_fill_manual(values = c("#CDC8B1", "#8B8878", "#EEE8CD")) +
labs(x = "Scimago Subject Area", y = "Count of screened Scimago journals", fill = "Award: ") +
theme(legend.position = "top", legend.justification=c(0, 1), axis.title.x = element_text(size = 10)) +
#FIX: the `size` aesthetic for lines was deprecated in ggplot2 3.4.0 - use `linewidth`
geom_hline(yintercept = 10, linetype = "dotted", color = "red", linewidth = 0.6) +
scale_x_discrete(position = "top")
B. Include awards with and without award description
#Panel B: counts of included awards per Subject Area, split by whether an
#award description was available (shares the vertical axis with Panel A).
figure1B <- BPdata %>%
count(Scimago_Subject_Area, Descr_available) %>%
arrange(desc(n)) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Descr_available)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_bw() +
scale_fill_manual(values = c("#669933", "gray80")) +
labs(x = "", y = "Count of included awards", fill = "Award description:") +
scale_x_discrete(labels = NULL) + labs(x = "") + #used to remove vertical labels, also breaks = NULL
theme(legend.position = "top", axis.title.x = element_text(size = 10))Word counts per award description:
#add new variable with counts of words in the award description:
#(str_count on "\\w+" counts word tokens; NA descriptions yield NA counts)
BPdata$Award_description_wordcount <- str_count(BPdata$Award_description, "\\w+")
#hist(str_count(BPdata$Award_description, "\\w+"), breaks = 20)
BPdata[is.na(BPdata$Scimago_Subject_Area), ] #no NA, but no description was coded as NA, so if 1 then change to 0 (no description)## # A tibble: 0 × 56
## # ℹ 56 variables: Timestamp <dttm>, Extractor_name <chr>,
## # Scimago_Subject_Area <chr>, Award_name <chr>,
## # Source_of_the_information_on_the_award <chr>, Award_description <chr>,
## # Eligible <chr>, Comment_on_the_award_eligibility <chr>,
## # Awarding_journal <chr>, Awarding_society <chr>, Awarding_other <chr>,
## # Comment_on_the_awarding_body <chr>, Award_cash <chr>,
## # Comment_on_the_award_cash <chr>, Career_stage <chr>, …
#one-word "descriptions" are placeholders, so recode them as 0 (no description):
BPdata$Award_description_wordcount[BPdata$Award_description_wordcount < 2] <- 0 #if 1 then change to 0 (no description)
#dim(BPdata[BPdata$Award_description_wordcount < 100, ])[1] #90 descriptions have less than 100 words! (includes 12 with no description)
#dim(BPdata[BPdata$Descr_available == "available" & BPdata$Award_description_wordcount < 150, ])[1] #120 descriptions have less than 200 words! (excludes 12 with no description), so 120/211 (75%)
# BPdata[BPdata$Award_description_wordcount > 250, c("Scimago_Subject_Area", "Award_name", "Award_description")] #18 awards (9%)
# BPdata[BPdata$Award_description_wordcount > 400, c("Scimago_Subject_Area", "Award_name", "Award_description")] #5 awards (2%)
# BPdata[BPdata$Award_description_wordcount > 500, c("Scimago_Subject_Area", "Award_name", "Award_description")] #2 awards (1%)Median length of award description: 123.
Plot of award description lengths - overall:
#Panel C: overall histogram of award-description lengths (20-word bins)
figure1C <- BPdata %>%
filter(!is.na(Award_description_wordcount)) %>%
ggplot(aes(x = Award_description_wordcount)) +
geom_histogram(binwidth = 20, fill = "grey60") +
theme_bw() +
labs(x = "Number of words in award description", y = "Award count") +
scale_x_continuous(breaks = c(seq(0, 1000, by = 100))) +
theme(legend.position = "none", axis.title.x = element_text(size = 10))Plot of award description lengths - by Subject Area
#same histogram as Panel C, faceted by Subject Area (diagnostic only; not saved)
BPdata %>%
filter(!is.na(Award_description_wordcount)) %>%
ggplot(aes(x = Award_description_wordcount)) +
geom_histogram(binwidth = 20, fill = "grey60") +
theme_bw() +
labs(x = "Number of words in award description", y = "Award count") +
scale_x_continuous(breaks = c(seq(0, 1000, by = 100))) +
theme(legend.position = "none", axis.title.x = element_text(size = 10)) +
facet_wrap(~Scimago_Subject_Area, ncol = 3)How old are the awards? NOTE: Using the year of oldest listed past winner as a proxy.
Overall:
#table(!is.na(BPdata$Awardee_list_earliest_year)) # values available
#plot overall - beeswarm
#Panel D: beeswarm of the earliest listed winner year (proxy for award age).
#NOTE(review): Row_excluded appears to serve only as a constant dummy x so all
#points share one row - confirm it is single-valued here; the shaded rectangle
#highlights the 2001-2022 winner-extraction window.
figure1D <- BPdata %>%
filter(!is.na(Awardee_list_earliest_year)) %>%
ggplot(aes(y = Awardee_list_earliest_year, x = Row_excluded)) +
geom_beeswarm(color = "#000000", alpha = 0.2, cex = 3.3) +
coord_flip() +
theme_bw() +
labs(y = "Past winners earliest year", x = "") +
scale_x_discrete(labels = NULL) + labs(x = "") + #used to remove vertical labels, also breaks = NULL
scale_y_continuous(breaks = c(seq(1900, 2030, by = 10))) +
theme(legend.position = "none", axis.title.x = element_text(size = 10)) +
annotate("rect", ymin = 2001, ymax = 2022, xmin = -1.5, xmax = 1.5, alpha = .1,fill = "#008B00")Plot by Subject Area:
#plot by SA - beeswarm of earliest winner year within each Subject Area
#(diagnostic only; not assembled into Figure 1)
BPdata %>%
filter(!is.na(Awardee_list_earliest_year)) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = Awardee_list_earliest_year)) +
geom_beeswarm(color = "#000000", alpha = 0.1) +
coord_flip() +
theme_bw() +
labs(y = "Past winners earliest year", x = "Scimago Subject Area") +
scale_y_continuous(breaks = c(seq(1900, 2030, by = 10))) +
theme(legend.position = "none", axis.title.x = element_text(size = 10))Combine 4 panels and save:
#assemble the panels using patchwork package:
#A and B side by side on the top row, C and D full-width below (heights 3:1:1)
figure1 <- (figure1A + figure1B) / figure1C / figure1D +
plot_layout(widths = c(1, 1), heights = c(3, 1, 1)) +
plot_annotation(tag_levels = "A")
#ggsave(plot = figure1, here("plots", "Fig1ABCD_screening_descriptions_v0.png"), width = 18, height = 18, units = "cm", dpi = "retina", scale = 1.2)
#ggsave(plot = figure1, here("plots", "Fig1ABCD_screening_descriptions_v0.pdf"), width = 18, height = 18, units = "cm", scale = 1.2)Figure 2A - Awarding bodies
# table(BPdata$Awarding_journal == "NA")
# table(BPdata$Awarding_society == "NA")
# table(BPdata$Awarding_other == "NA")
#create new variable with simple categorisation of awarding body type mentioned
#("NA" is the literal string used for a missing awarding body).
#FIX: use data-masked (bare) column names consistently inside case_when() -
#the original mixed bare columns with BPdata$ vectors, which silently
#misaligns if the piped data is ever filtered or reordered before mutate().
BPdata2 <- BPdata %>%
mutate(
Awarded_by = case_when(Awarding_journal != "NA" & Awarding_society == "NA" & Awarding_other == "NA" ~ "journal",
Awarding_journal == "NA" & Awarding_society != "NA" & Awarding_other == "NA" ~ "society",
Awarding_journal == "NA" & Awarding_society == "NA" & Awarding_other != "NA" ~ "other",
Awarding_journal != "NA" & Awarding_society != "NA" & Awarding_other == "NA" ~ "journal&society",
Awarding_journal != "NA" & Awarding_society != "NA" & Awarding_other != "NA" ~ "journal&society&other",
Awarding_journal != "NA" & Awarding_society == "NA" & Awarding_other != "NA" ~ "journal&other",
Awarding_journal == "NA" & Awarding_society != "NA" & Awarding_other != "NA" ~ "society&other",
TRUE ~ "none")
)
#table(BPdata2$Awarded_by) #NOTE: Awarded_by lives on BPdata2, not BPdata
#recode the three awarding-body columns to TRUE/FALSE flags for the upset plot.
#NOTE(review): this overwrites BPdata2 with only these 4 columns, discarding
#the Awarded_by variable created just above - confirm that is intended.
BPdata2 %>%
select(Award_name, Awarding_journal, Awarding_society, Awarding_other) %>%
mutate(
across(c(Awarding_journal, Awarding_society, Awarding_other), ~ if_else(. == "NA", FALSE, TRUE))
) -> BPdata2
#make upset plot using library(ggupset): combinations of awarding-body types
#NOTE(review): the column_to_rownames()/as_tibble() round-trip looks like a
#no-op, but column_to_rownames() also errors on duplicated Award_name values,
#so it acts as an implicit uniqueness check - confirm before simplifying.
figure2A <- BPdata2 %>%
column_to_rownames(var = "Award_name") %>%
as_tibble(rownames = "Award_name") %>%
gather(Body, Award, -Award_name) %>%
filter(Award) %>%
select(-Award) %>%
mutate(Body = recode(Body, 'Awarding_journal' = 'journal', 'Awarding_society' = 'society', 'Awarding_other' = 'other')) %>%
group_by(Award_name) %>%
summarize(Awarding_bodies = list(Body)) %>%
ggplot(aes(x = Awarding_bodies)) +
geom_bar(fill = "#a70042", width = 0.8) +
scale_x_upset(order_by = c("freq")) +
theme_combmatrix(combmatrix.panel.striped_background = FALSE,
combmatrix.panel.point.color.fill = "#a70042",
combmatrix.panel.line.color = "#a70042",
plot.title = element_text(family = "sans", size = 20, face = "plain", color = "black"),
axis.title.x = element_text(family="sans", size = 14, color = "black", face="plain"),
axis.title.y = element_text(family="sans", size = 14, color = "black", face = "plain", vjust = -2),
) +
labs(title = "A", x = "Awarding bodies combination", y = "Award count")Figure 2B - target awardee career stages
#table(BPdata$Career_stage) #main options: student, early-career, mid-career, any career stage, unclear
#create new variable with separate career stage values as a list
#NOTE(review): str_split() is applied to BPdata$Career_stage, not the piped
#BPdata2 (which lacks that column) - this relies on BPdata and BPdata2 having
#identical row order; verify alignment if either frame is ever filtered.
BPdata2 %>%
mutate(
Career_stage_list = str_split(BPdata$Career_stage, pattern = ", ")
) -> BPdata2
#BPdata$Career_stage_list <- str_split(BPdata$Career_stage, pattern = ", ") #same as above but modifies original data frame
#make upset plot using library(ggupset): combinations of target career stages
figure2B <- BPdata2 %>%
ggplot(aes(x = Career_stage_list)) +
geom_bar(fill = "#8B6969",width = 0.8) +
scale_x_upset(order_by = c("freq")) +
theme_combmatrix(combmatrix.panel.striped_background = FALSE,
combmatrix.panel.point.color.fill = "#8B6969",
combmatrix.panel.line.color = "#8B6969",
plot.title = element_text(family = "sans", size = 20, face = "plain", color = "black"),
axis.title.x = element_text(family="sans", size = 14, color = "black", face="plain"),
axis.title.y = element_text(family="sans", size = 14, color = "black", face = "plain", vjust = -2),
) +
labs(title = "B", x = "Target career stages combination", y = "")Combine 2 panels and save
#assemble the panels using patchwork package (panel tags are set via the
#manual title= labels above because plot_annotation does not work here)
figure2 <- figure2A / figure2B +
plot_layout(ncol = 2, nrow = 1, widths = c(1, 1)) #+ plot_annotation(tag_levels = "A") #does not work with this plot type
#ggsave(plot = figure2, here("plots", "Fig2AB_award_descriptions_v0.png"), width = 18, height = 8, units = "cm", bg = "white", dpi = "retina", scale = 1.6)
#ggsave(plot = figure2, here("plots", "Fig2AB_award_descriptions_v0.pdf"), width = 18, height = 8, units = "cm", scale = 1.6)#BPdata %>%
# count(Open_science) %>%
# ggplot(aes(x = Open_science, y = n, fill = Open_science)) +
# geom_col(width = 0.7) +
# geom_text(position = position_stack(vjust = 0.5), aes(label = n)) +
# theme_classic() +
# coord_flip() +
# labs(x = "Open Science valued", y = "Article count", fill = "Open_science") +
# theme(legend.position = "none", axis.title.x = element_text(size = 10))Figure 3A - Summary plots of key award characteristics
#select the yes/no-style award characteristic columns for the Figure 3A summary:
BPdata %>% select(Award_individual, Flexible_eligibility, Inclusivity_statement, Assessors_transparency, Award_integrity_mentioned, Process_transparency, Self_nomination, Letter_required, Feedback_availability, Award_contact_provided) -> BPdata3
## reshape into long dataframe format (var_name = characteristic, value = score):
BPdata3 %>% gather(key = var_name, value = value, 1:ncol(BPdata3)) -> BPdata3long
#str(BPdata3long)
#reorder var_name levels (reversed so the first item plots at the top after coord_flip)
BPdata3long <- BPdata3long %>% mutate(var_name = factor(var_name, levels = rev(c("Award_individual",
"Flexible_eligibility",
"Inclusivity_statement",
"Assessors_transparency",
"Award_integrity_mentioned",
"Process_transparency",
"Self_nomination",
"Letter_required",
"Feedback_availability",
"Award_contact_provided"
))))
#reorder value levels:
BPdata3long <- BPdata3long %>% mutate(value = factor(value, levels = rev(c("no",
"yes",
"not applicable",
"not available"))))
#Panel A: stacked horizontal bars of scores for ten award characteristics;
#fill colours map (in rev-level order) to not available / not applicable / yes / no
figure3A <- BPdata3long %>%
group_by(var_name) %>%
count(var_name, value) %>%
ggplot(aes(x = var_name, y = n, fill = value)) +
#geom_bar(aes(fill = value), stat = "identity", position = position_dodge(0.9)) +
geom_bar(aes(fill = value), stat = "identity") +
coord_flip() +
scale_fill_manual(values = c("#E5E5E5", "#CCCCCC", "#C1FFC1", "#FA8072")) +
labs(x = "Award characteristics", y = "Award count", fill = "Award scores") +
#FIX: corrected typo in the displayed axis label ("availabile" -> "available")
scale_x_discrete(labels = rev(c("Award is focused on individual authors",
"Eligibility career timeline is flexible",
"Inclusivity statement or encouragement is provided",
"Assessors are revealed",
"Award integrity mentioned",
"Process is transparent",
"Self-nomination is allowed",
"Nomination letter is required",
"Feedback is available",
"Award contact point is provided"
))) +
theme_classic()Figure 3B - Summary plots of key award characteristics
#select the award-criteria columns for the Figure 3B summary.
#NOTE: this reuses (overwrites) BPdata3/BPdata3long from the Figure 3A section;
#safe because figure3A has already been built above.
BPdata %>% select(Criteria_transparency, Award_impact_metrics_mentioned, Award_impact_metrics_only, Open_science) -> BPdata3
## reshape into long dataframe format:
BPdata3 %>% gather(key = var_name, value = value, 1:ncol(BPdata3)) -> BPdata3long
#str(BPdata3long)
#reorder var_name levels
BPdata3long <- BPdata3long %>% mutate(var_name = factor(var_name, levels = rev(c( "Criteria_transparency",
"Award_impact_metrics_mentioned",
"Award_impact_metrics_only",
"Open_science"))))
#reorder value levels:
BPdata3long <- BPdata3long %>% mutate(value = factor(value, levels = rev(c("no",
"yes",
"not available"))))
#Panel B: stacked horizontal bars of scores for the four award-criteria variables
figure3B <- BPdata3long %>%
group_by(var_name) %>%
count(var_name, value) %>%
ggplot(aes(x = var_name, y = n, fill = value)) +
#geom_bar(aes(fill = value), stat = "identity", position = position_dodge(0.9)) +
geom_bar(aes(fill = value), stat = "identity") +
coord_flip() +
scale_fill_manual(values = c("#E5E5E5", "#C1FFC1", "#FA8072")) +
labs(x = "Award criteria", y = "Award count", fill = "Award scores") +
scale_x_discrete(labels = rev(c("Assessment criteria are detailed out",
"Impact metrics are mentioned in criteria",
"Impact metrics are only criteria",
"Open Science practices are valued"))) +
theme_classic()Combine 2 panels and save
#assemble the panels using patchwork package (A over B, heights 2:1)
figure3 <- figure3A / figure3B +
plot_layout(ncol = 1, nrow = 2, heights = c(2, 1)) +
plot_annotation(tag_levels = "A")
#ggsave(plot = figure3, here("plots", "Fig3AB_award_descriptions_v0.png"), width = 18, height = 10, units = "cm", bg = "white", dpi = "retina", scale = 1.2)
#ggsave(plot = figure3, here("plots", "Fig3AB_award_descriptions_v0.pdf"), width = 18, height = 10, units = "cm", scale = 1.2)Figure SX - Award_individual by SA
#table(BPdata$Award_individual) #as a table - 62 are individual(28%)
#table(BPdata$Scimago_Subject_Area, BPdata$Award_individual) #as a table - 9 disciplines have 0!
#Supplementary figure: counts of individual-focused awards per Subject Area
BPdata %>%
mutate(Award_individual = factor(Award_individual, levels = (c("no", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Award_individual) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Award_individual)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Award is focused on individual authors:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))Figure SX - Flexible_eligibility by SA
#table(BPdata$Flexible_eligibility, useNA = "always") #as a table
#table(BPdata$Scimago_Subject_Area, BPdata$Flexible_eligibility, useNA = "always") #as a table
BPdata %>%
mutate(Flexible_eligibility = factor(Flexible_eligibility, levels = c("no", "yes", "not applicable", "not available"))) %>%
count(Scimago_Subject_Area, Flexible_eligibility) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Flexible_eligibility)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#C1FFC1", "#CCCCCC", "#E5E5E5")) +
#scale_fill_manual(values = c("#FA8072", "#FA807210", "#F1FFC1", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Eligibility career timeline is flexible:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))Figure SX - Inclusivity_statement by SA
# Stacked-bar counts of Inclusivity_statement per Scimago Subject Area.
#table(BPdata$Scimago_Subject_Area, BPdata$Inclusivity_statement) #as a table
BPdata %>%
mutate(Inclusivity_statement = factor(Inclusivity_statement, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Inclusivity_statement) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Inclusivity_statement)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Inclusivity statement or encouragement is provided:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Figure SX - Assessors_transparency by SA  ((review) fused heading said "Inclusivity_statement" — corrected)
# Stacked-bar counts of Assessors_transparency per Scimago Subject Area.
#table(BPdata$Scimago_Subject_Area, BPdata$Assessors_transparency) #as a table
BPdata %>%
mutate(Assessors_transparency = factor(Assessors_transparency, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Assessors_transparency) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Assessors_transparency)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Assessors are revealed (names or as journal editors):") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Figure SX - Process_transparency by SA  ((review) fused heading said "Inclusivity_statement" — corrected)
# Stacked-bar counts of Process_transparency per Scimago Subject Area.
#table(BPdata$Scimago_Subject_Area, BPdata$Process_transparency) #as a table
BPdata %>%
mutate(Process_transparency = factor(Process_transparency, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Process_transparency) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Process_transparency)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Process is transparent:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Figure SX - Self_nomination by SA
# Stacked-bar counts of Self_nomination per Scimago Subject Area.
#table(BPdata$Scimago_Subject_Area, BPdata$Self_nomination) #as a table
BPdata %>%
mutate(Self_nomination = factor(Self_nomination, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Self_nomination) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Self_nomination)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Self-nomination is allowed:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Figure SX - Letter_required by SA
# Stacked-bar counts of Letter_required per Scimago Subject Area.
#table(BPdata$Scimago_Subject_Area, BPdata$Letter_required) #as a table
BPdata %>%
mutate(Letter_required = factor(Letter_required, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Letter_required) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Letter_required)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Nomination letter is required:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Figure SX - Feedback_availability by SA
# Stacked-bar counts of Feedback_availability per Scimago Subject Area.
#table(BPdata$Scimago_Subject_Area, BPdata$Feedback_availability) #as a table
BPdata %>%
mutate(Feedback_availability = factor(Feedback_availability, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Feedback_availability) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Feedback_availability)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Feedback is available:") + #(review) fixed legend-title typo "availabile"
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Figure SX - Award_contact_provided by SA
# Stacked-bar counts of Award_contact_provided per Scimago Subject Area.
#table(BPdata$Scimago_Subject_Area, BPdata$Award_contact_provided) #as a table
BPdata %>%
mutate(Award_contact_provided = factor(Award_contact_provided, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Award_contact_provided) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Award_contact_provided)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Award contact point is provided:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Figure SX - Criteria_transparency by SA
# Stacked-bar counts of Criteria_transparency per Scimago Subject Area.
#table(BPdata$Scimago_Subject_Area, BPdata$Criteria_transparency) #as a table
BPdata %>%
mutate(Criteria_transparency = factor(Criteria_transparency, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Criteria_transparency) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Criteria_transparency)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Assessment criteria are detailed out:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Figure SX - Award_impact_metrics_mentioned by SA
# Stacked-bar counts of Award_impact_metrics_mentioned per Scimago Subject Area.
#table(BPdata$Scimago_Subject_Area, BPdata$Award_impact_metrics_mentioned) #as a table
BPdata %>%
mutate(Award_impact_metrics_mentioned = factor(Award_impact_metrics_mentioned, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Award_impact_metrics_mentioned) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Award_impact_metrics_mentioned)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Impact metrics are mentioned in criteria:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Figure SX - Award_impact_metrics_only by SA
# Stacked-bar counts of Award_impact_metrics_only per Scimago Subject Area.
#table(BPdata$Scimago_Subject_Area, BPdata$Award_impact_metrics_only) #as a table
BPdata %>%
mutate(Award_impact_metrics_only = factor(Award_impact_metrics_only, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Award_impact_metrics_only) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Award_impact_metrics_only)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Impact metrics are only criteria:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Figure SX - Open_science by SA
# Stacked-bar counts of Open_science per Scimago Subject Area.
#table(BPdata$Scimago_Subject_Area, BPdata$Open_science) #as a table
BPdata %>%
mutate(Open_science = factor(Open_science, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
count(Scimago_Subject_Area, Open_science) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Open_science)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Open Science practices are valued:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Award descriptions - wordcloud of frequencies all words.
#Using packages tidytext and stopwords
# Tokenize award descriptions (one word per row), drop stop words, then stem.
Award_description_txt <- tibble(txt = tolower(BPdata$Award_description))
Award_description_txt <- Award_description_txt %>% unnest_tokens(output = word, input = txt, token = "words", to_lower = TRUE) #restructure all descriptions as one-token-per-row format
Award_description_txt <- Award_description_txt %>% anti_join(get_stopwords()) #remove stop words from library(stopwords) ## Joining with `by = join_by(word)`
#(review) removed a byte-identical duplicate anti_join(get_stopwords()) call — the second pass was a no-op
Award_description_txt$word <- tokenize_word_stems(Award_description_txt$word) #stem each word; NOTE(review): original comment said "make lowercase", but this stems. tokenize_word_stems() returns a list, so `word` becomes a list-column — confirm count() below groups as intended
word.freq <- Award_description_txt %>% count(word, sort = TRUE) #count words
## plot wordcloud using library(wordcloud) for top 100 words
#word.freq %>% with(wordcloud(word, n, max.words = 100, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(12, "Paired")))
# Award descriptions - count mentions of specific words (stemmed):
#create list of specific words (stemmed) to count within strings
specific.words <- c("best", "outstand", "impact", "original", "significant", "merit", "excell", "innovat", "novel", "creativ", "transparen", "reliab", "robust", "reproduc", "replica", "rigor", "rigour", "trust")
#prepare award descriptions as a single lowercase string
# (review) pull() + paste(collapse) builds the single string explicitly; the
# original piped a one-column data frame into tolower(), which relied on
# deparsing the whole data frame into a 'c("...", ...)' string — fragile
# (quoting/escape artifacts could inflate counts).
descriptions <- BPdata %>%
filter(!is.na(Award_description)) %>%
pull(Award_description) %>%
paste(collapse = " ") %>%
tolower() #single lowercase string
#sum of all mentions for each word
specific.words.mentions <- specific.words %>%
map_int(~ str_count(tolower(descriptions), .x))
#prepare award descriptions while keeping them separate for each award
descriptions2 <- tolower(BPdata$Award_description) #vector of lowercase strings
#sum of mentions per award for each word (counts only one mention per award)
specific.words.mentions2 <- specific.words %>%
map_int(~ sum(str_detect(descriptions2, .x), na.rm = TRUE))
# ## doing the same as above, but manually:
# #count all mentions of words(parts) individually, e.g.:
# sum(str_count(tolower(BPdata$Award_description), "best"), na.rm = TRUE) #169
# #counting once per award, e.g.
# sum(str_detect(BPdata$Award_description, "best"), na.rm = TRUE) #82
# Plot frequencies of specific words - all mentions in award descriptions.
# Word-frequency table: all mentions vs once-per-award mentions.
words.df <- tibble(Words = specific.words,
Count_all = specific.words.mentions,
Count_once = specific.words.mentions2)
# Figure 4A: counts of all mentions per word stem.
figure4A <- words.df %>%
ggplot(aes(x = reorder(Words, Count_all), y = Count_all)) +
geom_col(width = 0.8, fill = "#838B83") +
coord_flip() +
scale_y_continuous(breaks = c(0, 50, 100, 150), limits = c(0, 200)) +
theme_bw() +
labs(x = "Word stem", y = "Count of all mentions") +
annotate(geom = "rect",xmin = 8.5, xmax = Inf, ymin = -Inf, ymax = Inf, fill = "#A52A2A", alpha = 0.1) +
annotate(geom = "rect",xmin = -Inf, xmax = 8.5, ymin = -Inf, ymax = Inf, fill = "#00FF00", alpha = 0.1) +
theme(legend.position = "none", axis.title.x = element_text(size = 10))
# Plot frequencies of specific words - first mentions in award descriptions.
figure4B <- words.df %>%
ggplot(aes(x = reorder(Words, Count_all), y = Count_once)) +
geom_col(width = 0.8, fill = "#C1CDC1") +
coord_flip() +
scale_y_continuous(breaks = c(0, 50, 100, 150), limits = c(0, 200)) +
theme_bw() +
labs(x = "Word stem", y = "Count of first mention per award") +
scale_x_discrete(labels = NULL) + labs(x = "") + #used to remove vertical labels, also breaks = NULL
annotate(geom = "rect",xmin = 8.5, xmax = Inf, ymin = -Inf, ymax = Inf, fill = "#A52A2A", alpha = 0.1) +
annotate(geom = "rect",xmin = -Inf, xmax = 8.5, ymin = -Inf, ymax = Inf, fill = "#00FF00", alpha = 0.1) +
theme(legend.position = "none", axis.title.x = element_text(size = 10))
# Combine 2 panels and save:
#assemble the panels using patchwork package
# NOTE(review): "/" stacks vertically, but plot_layout(ncol = 2, nrow = 1)
# overrides to side-by-side; "figure4A | figure4B" would state the intent directly.
figure4 <- figure4A / figure4B +
plot_layout(ncol = 2, nrow = 1) +
plot_annotation(tag_levels = "A")
#ggsave(plot = figure4, here("plots", "Fig4AB_words_counts_v0.png"), width = 18, height = 8, units = "cm", dpi = "retina", scale = 1.2)
#ggsave(plot = figure4, here("plots", "Fig4AB_words_counts_v0.pdf"), width = 18, height = 8, units = "cm", scale = 1.2)
# How many awards extracted overall?
length(unique(BPdata$Award_name[BPdata$N_winners_extracted != 0])) #62 - number from the records in the award-level data set; NOTE(review): knitted output shows 61, not 62 — reconcile (NA values of N_winners_extracted make `!= 0` evaluate to NA, which can add an NA element to the subset)## [1] 61
length(unique(BPindiv$award_name)) #62 - number from records in past winner's data## [1] 62
# How many awards extracted per SA?
#from BPdata:
# Count of awards with at least one extracted winner, per Subject Area.
BPdata %>%
group_by(Scimago_Subject_Area) %>%
filter(N_winners_extracted != 0) %>%
count(Scimago_Subject_Area) -> awards_perSA
# #from BPindiv:
# BPindiv %>%
# group_by(award_SA) %>%
# summarise(count = n_distinct(award_name))
#plot counts where >0 awards per SA
p_count.awards.SA <- BPindiv %>%
group_by(award_SA) %>%
summarise(count = n_distinct(award_name)) %>%
ggplot(aes(x = reorder(award_SA, desc(award_SA)), y = count)) +
geom_bar(stat = "identity", position = position_dodge(0.9)) +
coord_flip() +
theme_bw() +
labs(x = "Scimago Subject Area", y = "Award count")
# How many records (names) extracted overall?
#sum(BPdata$N_winners_extracted, na.rm = TRUE) #number from the records in the award-level data set
#dim(BPindiv)[1] #number of records in past winner's data
# How many records (names) extracted per SA?
#from BPdata:
# Sum of extracted winner names per Subject Area.
# (review) filter NA rows first, then group and summarise — same result as the
# original group_by/select/filter pipeline, without the "Adding missing
# grouping variables" message from select() on grouped data.
BPdata %>%
filter(!is.na(N_winners_extracted)) %>%
group_by(Scimago_Subject_Area) %>%
summarise(sum_winners = sum(N_winners_extracted), .groups = 'drop') -> winners_perSA1
winners_perSA1[winners_perSA1$sum_winners == 0, ] #8 SA without any extracted names## # A tibble: 8 × 2
## Scimago_Subject_Area sum_winners
## <chr> <dbl>
## 1 Business, Management and Accounting 0
## 2 Decision Sciences 0
## 3 Economics, Econometrics and Finance 0
## 4 Engineering 0
## 5 Health Professions 0
## 6 Materials Science 0
## 7 Mathematics 0
## 8 Social Sciences 0
dim(winners_perSA1[winners_perSA1$sum_winners != 0, ])[1] #number of SA with any extracted names## [1] 19
winners_perSA1[winners_perSA1$sum_winners != 0, ] #table of SA with any extracted names## # A tibble: 19 × 2
## Scimago_Subject_Area sum_winners
## <chr> <dbl>
## 1 Agricultural and Biological Sciences 69
## 2 Arts and Humanities 17
## 3 Biochemistry, Genetics and Molecular Biology 152
## 4 Chemical Engineering 33
## 5 Chemistry 95
## 6 Computer Science 45
## 7 Dentistry 14
## 8 Earth and Planetary Sciences 15
## 9 Energy 21
## 10 Environmental Science 19
## 11 Immunology and Microbiology 16
## 12 Medicine 37
## 13 Multidisciplinary 40
## 14 Neuroscience 134
## 15 Nursing 63
## 16 Pharmacology, Toxicology and Pharmaceutics 42
## 17 Physics and Astronomy 122
## 18 Psychology 81
## 19 Veterinary 64
#from BPindiv:
#table(BPindiv$award_SA)
dim(table(BPindiv$award_SA, useNA = "always")) #19 SA with any extracted names## [1] 20
# Per-SA counts of extracted winner records (SAs with 0 extracted do not appear).
BPindiv %>%
group_by(award_SA) %>% count() -> winners_perSA1_extracted #does not show SA with 0 extracted
#plot counts where >0 winners per SA
p_count.awardees.SA <- BPindiv %>%
group_by(award_SA) %>% count() %>%
ggplot(aes(x = reorder(award_SA, desc(award_SA)), y = n)) +
geom_bar(stat = "identity", position = position_dodge(0.9)) +
coord_flip() +
theme_bw() +
#scale_x_discrete(labels = NULL) + labs(x = "") + #used to remove vertical labels, also breaks = NULL
labs(x = "Scimago Subject Area", y = "Awardee count")
# View(left_join(x = winners_perSA1, y = winners_perSA1_extracted, by = c("Scimago_Subject_Area" = "award_SA"))) #check if matching numbers
# Combine plots of total counts of awards and winners per SA
#assemble the panels using patchwork package
count.awards.awardees_SA <- p_count.awards.SA / p_count.awardees.SA +
plot_layout(ncol = 2, nrow = 1) +
plot_annotation(tag_levels = "A")
#ggsave(here("plots", "count.awards.awardees_SA_v1.png"), width = 18, height = 8, units = "cm", dpi = "retina")
#ggsave(here("plots", "count.awards.awardees_SA_v1.pdf"), width = 18, height = 8, units = "cm")
# How many records per decade?
table(BPindiv$award_year, useNA = "always")##
## 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
## 9 11 11 13 13 20 16 20 21 31 41 45 56 58 74 78
## 2017 2018 2019 2020 2021 2022 <NA>
## 83 92 93 102 107 89 0
#create decade variable 2001-2010, 2011-2020, 2021-2022
# (review) removed stray branch `award_year == "Veterinary" ~ "Biology"` — a
# copy-paste leftover comparing the numeric year to a subject-area string; it
# could never match a year and mapped to a label that is not a decade.
# Also use NA_character_ so the fallback matches the character RHS type.
BPindiv %>% mutate(Decade = case_when(
award_year >= 2001 & award_year <= 2010 ~ "2001-2010",
award_year >= 2011 & award_year <= 2020 ~ "2011-2020",
award_year >= 2021 & award_year <= 2022 ~ "2021-2022",
TRUE ~ NA_character_)
) -> BPindiv
table(BPindiv$Decade, useNA = "always")##
## 2001-2010 2011-2020 2021-2022 <NA>
## 165 722 196 0
# How many records per decade and SA?
# Cross-tabulation of awardee records: Subject Area x Decade (knitted output kept below).
table(BPindiv$award_SA, BPindiv$Decade)##
## 2001-2010 2011-2020 2021-2022
## Agricultural and Biological Sciences 24 36 9
## Arts and Humanities 1 14 2
## Biochemistry, Genetics and Molecular Biology 37 84 31
## Chemical Engineering 0 16 17
## Chemistry 0 78 17
## Computer Science 1 32 12
## Dentistry 3 10 1
## Earth and Planetary Sciences 0 11 4
## Energy 3 16 2
## Environmental Science 1 15 3
## Immunology and Microbiology 0 12 4
## Medicine 4 21 12
## Multidisciplinary 10 23 7
## Neuroscience 32 83 23
## Nursing 20 39 4
## Pharmacology, Toxicology and Pharmaceutics 12 24 6
## Physics and Astronomy 0 102 20
## Psychology 1 64 16
## Veterinary 16 42 6
# How many individuals shared awards?
# Shared-award counts overall, by Decade, and by Subject Area (knitted output kept below).
table(BPindiv$shared, useNA = "always") #157 shared (14% of names), usually shared by 2 authors)##
## no yes <NA>
## 926 157 0
table(BPindiv$Decade, BPindiv$shared) #more common recently?##
## no yes
## 2001-2010 148 17
## 2011-2020 630 92
## 2021-2022 148 48
table(BPindiv$award_SA, BPindiv$shared) # sharing most frequent in Computer Science##
## no yes
## Agricultural and Biological Sciences 65 4
## Arts and Humanities 17 0
## Biochemistry, Genetics and Molecular Biology 126 26
## Chemical Engineering 28 5
## Chemistry 84 11
## Computer Science 10 35
## Dentistry 14 0
## Earth and Planetary Sciences 12 3
## Energy 9 12
## Environmental Science 17 2
## Immunology and Microbiology 14 2
## Medicine 35 2
## Multidisciplinary 32 8
## Neuroscience 115 23
## Nursing 61 2
## Pharmacology, Toxicology and Pharmaceutics 26 16
## Physics and Astronomy 120 2
## Psychology 79 2
## Veterinary 62 2
# How many awards had article information?
# NOTE: during first round of data extraction, we located around 3/4 of winning articles - these were articles where title or DOI was available on award page or announcement. However, article information was sometimes only publicly listed for recently awarded articles. Additional Internet searches increased the number of identified articles to 795.
# Missing-ness of winning-article reference and DOI among awardee records.
table(is.na(BPindiv$awarded_paper_ref)) #200 missing (out of 1083 = 18%)##
## FALSE TRUE
## 883 200
table(is.na(BPindiv$awarded_paper_doi)) #201 missing (out of 1083 = 18%)##
## FALSE TRUE
## 882 201
# How many have gender assigned and how?
# Gender assignment counts and sources (knitted output kept below).
table(BPindiv$awardee_gender, useNA = "always") #0 missing or unclear##
## F M <NA>
## 426 657 0
round(table(BPindiv$awardee_gender, useNA = "always")/length(BPindiv$awardee_gender) * 100, 1) #as percentages##
## F M <NA>
## 39.3 60.7 0.0
table(BPindiv$gender_source, useNA = "always") #0 missing - change to "not found"##
## name photo pronouns <NA>
## 653 262 168 0
round(table(BPindiv$gender_source, useNA = "always")/length(BPindiv$awardee_gender) * 100, 1) #as percentages##
## name photo pronouns <NA>
## 60.3 24.2 15.5 0.0
# Gender counts overall
# Gender counts per Subject Area (knitted output kept below).
BPindiv %>%
group_by(award_SA) %>%
count(awardee_gender)## # A tibble: 38 × 3
## # Groups: award_SA [19]
## award_SA awardee_gender n
## <chr> <chr> <int>
## 1 Agricultural and Biological Sciences F 33
## 2 Agricultural and Biological Sciences M 36
## 3 Arts and Humanities F 4
## 4 Arts and Humanities M 13
## 5 Biochemistry, Genetics and Molecular Biology F 62
## 6 Biochemistry, Genetics and Molecular Biology M 90
## 7 Chemical Engineering F 12
## 8 Chemical Engineering M 21
## 9 Chemistry F 41
## 10 Chemistry M 54
## # ℹ 28 more rows
# Gender counts per Decade (knitted output kept below).
BPindiv %>%
group_by(Decade) %>%
count(awardee_gender)## # A tibble: 6 × 3
## # Groups: Decade [3]
## Decade awardee_gender n
## <chr> <chr> <int>
## 1 2001-2010 F 65
## 2 2001-2010 M 100
## 3 2011-2020 F 284
## 4 2011-2020 M 438
## 5 2021-2022 F 77
## 6 2021-2022 M 119
# Gender plot overall
## set the levels in order we want:
# (review) relabel explicitly by level value rather than by position — the
# original `levels(x) <- c("female","male")` silently mislabels if the factor
# levels are ever not exactly F, M (they are F/M per the table above).
BPindiv$Gender <- factor(BPindiv$awardee_gender, levels = c("F", "M"), labels = c("female", "male"))
## plot for a all SA and years - not stacked, with count values:
BPindiv %>%
# group_by(award_SA) %>%
count(Gender) %>%
ggplot(aes(x = Gender, y = n)) +
geom_bar(aes(fill = Gender), stat = "identity", position = position_dodge(0.9)) +
scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
theme_bw() +
theme(legend.position = "none", legend.box = "horizontal", axis.text = element_text(size = 10)) +
labs(fill = "Awardee gender:") +
labs(x = "Awardees", y = "Awardee count")
# Gender plots by Decade
## plot for a all awards by Decade - not stacked, with count values (40-41% in each decade):
BPindiv %>%
group_by(Decade) %>%
count(Gender) %>%
ggplot(aes(x = Decade, y = n, fill = Gender)) +
geom_bar(aes(fill = Gender), stat = "identity", position = position_dodge(0.8), width = 0.8, col = "white") + # use , position = "fill" for proportion plot
# coord_flip() +
scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
theme_bw() +
theme(legend.position="top", legend.box = "horizontal", axis.text = element_text(size = 10)) +
labs(fill = "Awardee gender:") +
labs(x = "Decade", y = "Awardee count")
## plot for a all awards by Decade - horizontal, not stacked, with count values:
figure5A <- BPindiv %>%
group_by(Decade) %>%
count(Gender) %>%
ggplot(aes(y = Decade, x = n, fill = Gender)) +
geom_bar(aes(fill = Gender), stat = "identity") +
scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
theme_bw() +
theme(legend.position="top", legend.text = element_text(size = 8), legend.title = element_text(size = 8),
axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
guides(fill = guide_legend(nrow = 1, byrow = TRUE, reverse = TRUE)) +
labs(fill = "Awardee gender:") +
labs(y = "Decade", x = "Awardee counts")
## plot for a all awards by Decade - horizontal, stacked, with proportion values:
figure5B <- BPindiv %>%
group_by(Decade) %>%
count(Gender) %>%
ggplot(aes(x = Decade, y = n, fill = Gender)) +
geom_bar(aes(fill = Gender), stat = "identity", position = "fill") + # use , position = "fill" for proportion plot
coord_flip() +
scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
theme_bw() +
theme(legend.position = "none", legend.box = "horizontal", axis.text = element_text(size = 10)) +
labs(fill = "Awardee gender:") +
labs(x = "Decade", y = "Awardee proportion")
# Gender plot by SA
## plot for all awards by SA - stacked, with proportion values
## (review) original comment said "not stacked, with count values", but position = "fill" plots stacked proportions
BPindiv %>%
group_by(award_SA) %>%
count(Gender) %>%
ggplot(aes(x = reorder(award_SA, desc(award_SA)), y = n, fill = Gender)) +
geom_bar(aes(fill = Gender), stat = "identity", position = "fill") + # use , position = "fill" for proportion plot
coord_flip() +
scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
theme_bw() +
theme(legend.position="top", legend.box = "horizontal", axis.text = element_text(size = 10)) +
labs(fill = "Awardee gender:") +
labs(x = "Scimago Subject Area", y = "Awardee proportion")
# Gender plot by SA and Decade
## plot for a all awards by SA and Decade - not stacked, with count values:
BPindiv %>%
group_by(award_SA, Decade) %>%
count(Gender) %>%
ggplot(aes(x = reorder(award_SA, desc(award_SA)), y = n, fill = Gender)) +
geom_bar(aes(fill = Gender), stat = "identity", position = position_dodge(0.8), width = 0.8, col = "white") + # use , position = "fill" for proportion plot
facet_wrap(~Decade, scales = "fixed", nrow = 1, ncol = 3) +
coord_flip() +
scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
theme_bw() +
theme(legend.position="top", legend.box = "horizontal", axis.text = element_text(size = 10)) +
labs(fill = "Awardee gender:") +
labs(x = "Scimago Subject Area", y = "Awardee count")
# How many had affiliation info and from where (affiliation_info_source)?
length(unique(BPindiv$affiliation_country)) #44 unique country names## [1] 44
table(is.na(BPindiv$affiliation_country)) #54 values missing (out of 1083 = 5%); (review) comment previously said "45 ... 4%", contradicting the output below##
## FALSE TRUE
## 1029 54
table(BPindiv$affiliation_info_source, useNA = "always") #affiliations sources##
## article award announcement award page other
## 487 159 355 29
## <NA>
## 53
# (review) percentages now use the actual record count instead of the
# hard-coded 1058 (the vector has 1083 records, so the old denominator
# inflated every percentage; the knitted output below reflects the old code).
round(table(BPindiv$affiliation_info_source, useNA = "always")/length(BPindiv$affiliation_info_source)*100,1) #as percentages##
## article award announcement award page other
## 46.0 15.0 33.6 2.7
## <NA>
## 5.0
# Diversity of affiliation countries over time (knitted output kept below).
#count number of unique countries per decade:
BPindiv %>%
filter(!is.na(affiliation_country)) %>%
group_by(Decade) %>%
summarise(count = n_distinct(affiliation_country))## # A tibble: 3 × 2
## Decade count
## <chr> <int>
## 1 2001-2010 26
## 2 2011-2020 39
## 3 2021-2022 29
#count number of unique countries per year:
BPindiv %>%
filter(!is.na(affiliation_country)) %>%
group_by(award_year) %>%
summarise(count = n_distinct(affiliation_country))## # A tibble: 22 × 2
## award_year count
## <dbl> <int>
## 1 2001 3
## 2 2002 5
## 3 2003 5
## 4 2004 6
## 5 2005 4
## 6 2006 7
## 7 2007 6
## 8 2008 8
## 9 2009 7
## 10 2010 12
## # ℹ 12 more rows
# Overall awardee counts per affiliation country, most frequent first (knitted output kept below).
#count countries:
BPindiv %>%
filter(!is.na(affiliation_country)) %>%
count(affiliation_country) %>%
arrange(desc(n))## # A tibble: 43 × 2
## affiliation_country n
## <chr> <int>
## 1 USA 491
## 2 UK 70
## 3 China 60
## 4 Germany 51
## 5 Canada 40
## 6 Australia 34
## 7 Japan 23
## 8 Netherlands 22
## 9 New Zealand 22
## 10 France 20
## # ℹ 33 more rows
# #count countries - view top 20:
# BPindiv %>%
# filter(!is.na(affiliation_country)) %>%
# count(affiliation_country) %>%
# arrange(desc(n)) %>%
# filter(n>5) %>%
# View()
#recode top 10 countries + other
# Collapse affiliation_country to the 10 most frequent countries + "other".
# NOTE(review): dplyr::recode() is superseded (case_match() is the modern
# replacement); identity mappings like "USA" = "USA" are kept for readability.
# NAs in affiliation_country remain NA (recode's .missing default) — they are
# NOT folded into "other".
BPindiv$Country10 <- recode(BPindiv$affiliation_country,
"USA" = "USA",
"UK" = "UK",
"Australia" = "Australia",
"China" = "China",
"Germany" = "Germany",
"Canada" = "Canada",
"Japan" = "Japan",
"Netherlands" = "Netherlands",
"Italy" = "Italy",
"France" = "France",
.default = "other")
#table(BPindiv$Country10)
#recode countries with n>5 (20)
#BPindiv$Country20 <- recode(BPindiv$affiliation_country,
# "USA" = "USA",
# "UK" = "UK",
# "Australia" = "Australia",
# "China" = "China",
# "Germany" = "Germany",
# "Canada" = "Canada",
# "Japan" = "Japan",
# "Netherlands" = "Netherlands",
# "Italy" = "Italy",
# "France" = "France",
# "Austria" = "Austria",
# "Spain" = "Spain",
# "Sweden" = "Sweden",
# "Switzerland" = "Switzerland",
# "India" = "India",
# "South Korea" = "South Korea",
# "Brazil" = "Brazil",
# "Singapore" = "Singapore",
# "Taiwan" = "Taiwan",
# "Belgium" = "Belgium",
# .default = "other")
#table(BPindiv$Country20)
# Country affiliation plots overall
#all countries as a simple barplot
BPindiv %>%
filter(!is.na(affiliation_country)) %>%
count(affiliation_country) %>%
arrange((n)) %>%
ggplot(aes(x = reorder(affiliation_country, n), y = n)) +
geom_bar(stat = "identity", position = position_dodge(0.9)) +
coord_flip()
#top 10 countries - two-colour barplot
BPindiv %>%
filter(!is.na(Country10)) %>%
count(Country10) %>%
arrange((n)) %>%
ggplot(aes(x = reorder(Country10, n), y = n)) +
geom_bar(aes(fill= Country10 == "other"), stat = "identity", position = position_dodge(0.9)) +
scale_fill_manual(values = c("#DCDCDCFF", "#000000")) +
coord_flip() +
theme_bw() +
theme(legend.position="none", axis.text = element_text(size = 10)) +
labs(x = "Affiliation country", y = "Awardee count")
# Country affiliation plot by SA
#top 10 countries by SA
BPindiv %>%
filter(!is.na(Country10)) %>%
count(award_SA, Country10) %>%
# (review) global aes was `fill = Country` — a typo; no `Country` column exists
# (the layer aes below masked it, but the mapping should name the real column)
ggplot(aes(y = reorder(award_SA, desc(award_SA)), x = n, fill = Country10)) +
#geom_bar(stat = "identity")
geom_bar(aes(fill = Country10), stat = "identity") +
scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
theme_bw() +
theme(legend.position="top", axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
labs(fill = "Awardee affiliation country:") +
# (review) axis labels were swapped relative to the aes (y = SA, x = count)
labs(y = "Scimago Subject Area", x = "Awardee count")
#ggsave(here("plots", "awardee_country_SA_v1.png"), width = 18, height = 8, units = "cm", dpi = "retina")
#ggsave(here("plots", "awardee_country_SA_v1.pdf"), width = 18, height = 8, units = "cm")
# Country affiliation plot by decade
#top 10 countries by Decade - horizontal, stacked, with count values
figure5C <- BPindiv %>%
filter(!is.na(Country10)) %>%
count(Decade, Country10) %>%
ggplot(aes(y = Decade, x = n, fill = Country10)) +
geom_bar(aes(fill = Country10), stat = "identity") + # use , position = "fill" for proportion plot
scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
theme_bw() +
theme(legend.position="top", legend.text = element_text(size = 8), legend.title = element_text(size = 8),
axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
guides(fill = guide_legend(nrow = 2, byrow = TRUE, reverse = TRUE)) +
labs(fill = "Awardee affiliation country:") +
labs(y = "Decade", x = "Awardee counts")
#top 10 countries by Decade - horizontal, stacked, with proportion values
figure5D <- BPindiv %>%
filter(!is.na(Country10)) %>%
count(Decade, Country10) %>%
ggplot(aes(y = Decade, x = n, fill = Country10)) +
geom_bar(aes(fill = Country10), stat = "identity", position = "fill") + # use , position = "fill" for proportion plot
scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
theme_bw() +
theme(legend.position = "none", axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
labs(fill = "Awardee affiliation country:") +
labs(y = "Decade", x = "Awardee proportion")
# Country affiliation plot by decade and by SA
#top 10 countries by SA and decade
BPindiv %>%
filter(!is.na(Country10)) %>%
count(award_SA, Decade, Country10) %>%
ggplot(aes(y = reorder(award_SA, desc(award_SA)), x = n, fill = Country10)) +
#geom_bar(stat = "identity")
geom_bar(aes(fill = Country10), stat = "identity") + # use , position = "fill" for proportion plot
scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
facet_wrap(~Decade, scales = "fixed", nrow = 1, ncol = 3) +
theme_bw() +
theme(legend.position="bottom", axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
labs(fill = "Awardee affiliation country:") +
labs(x = "Awardee count", y = "Scimago Subject Area")
#ggsave(here("plots", "awardee_country_SA_Decade_v1.png"), width = 18, height = 8, units = "cm", dpi = "retina")
#ggsave(here("plots", "awardee_country_SA_Decade_v1.pdf"), width = 18, height = 8, units = "cm")
# Combine 4 panels and save:
#assemble the 4 gender/country panels into Figure 5 using the patchwork package
figure5 <- figure5A / figure5B / figure5C / figure5D +
plot_layout(ncol = 1, nrow = 4) +
plot_annotation(tag_levels = "A")
#ggsave(plot = figure5, here("plots", "Fig5ABCD_gender_countries_v0.png"), width = 18, height = 14, units = "cm", dpi = "retina", scale = 1.2)
#ggsave(plot = figure5, here("plots", "Fig5ABCD_gender_countries_v0.pdf"), width = 18, height = 14, units = "cm", scale = 1.2)
#load Scimago country-level productivity ranking. NOTE: this assignment had been
#fused into the ggsave comment above during extraction (commenting it out);
#restored as live code because COprod is required by the following section.
COprod <- read_csv(here("data", "scimagojr country rank 2021.csv"), skip = 0) #load data
## Rows: 235 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country, Region
## dbl (7): Rank, Documents, Citable documents, Citations, Self-citations, Cita...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#top 10 countries: harmonise COprod country names with BPindiv$Country10
#NOTE(review): gsub() replaces substrings, not exact names - fine for these values,
#but an exact-match recode would be safer; confirm no partial-name collisions
COprod$Country %in% unique(BPindiv$Country10) %>% sum() #check overlap - 8## [1] 8
setdiff(unique(BPindiv$Country10), COprod$Country) #missing: "United States" and "United Kingdom"## [1] "other" "USA" "UK" NA
COprod$Country <- gsub("United States", "USA", COprod$Country) #replace with matching name
COprod$Country <- gsub("United Kingdom", "UK", COprod$Country) #replace with a matching name
#all countries: harmonise with BPindiv$affiliation_country
COprod$Country %in% unique(BPindiv$affiliation_country) %>% sum() #check overlap - 40 ## [1] 40
setdiff(unique(BPindiv$affiliation_country), COprod$Country) #missing: "Vietnam" "UAE" "Russia" ## [1] "Vietnam" NA "UAE" "Russia"
COprod$Country <- gsub("Viet Nam", "Vietnam", COprod$Country) #replace with matching name
COprod$Country <- gsub("United Arab Emirates", "UAE", COprod$Country) #replace with matching name
COprod$Country <- gsub("Russian Federation", "Russia", COprod$Country) #replace with a matching name
## Plot for all countries
#recode top 10 countries + other: the identity mappings ("USA" = "USA", ...) only
#mark which countries to KEEP; .default sends every other country to "other"
COprod$Country_affil <- recode(COprod$Country,
"USA" = "USA",
"UK" = "UK",
"Australia" = "Australia",
"China" = "China",
"Germany" = "Germany",
"Canada" = "Canada",
"Japan" = "Japan",
"Netherlands" = "Netherlands",
"Italy" = "Italy",
"France" = "France",
.default = "other")
#table(COprod_affil$Country_affil)
names(COprod)## [1] "Rank" "Country" "Region"
## [4] "Documents" "Citable documents" "Citations"
## [7] "Self-citations" "Citations per document" "H index"
## [10] "Country_affil"
## Plot productivity of top 10 countries and other - single stacked proportion bar of Documents, like for countries that are in BPindiv data
COprod %>%
ggplot(aes(x = 1, y = Documents, fill = Country_affil)) + #reorder(Country_affil, desc(Documents))
geom_bar(stat = "identity", position = "fill") + #single bar (x = 1), stacked as proportions
coord_flip() +
#geom_bar(aes(fill = Country_affil), stat = "identity", position = "fill") +
scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
theme_bw() +
theme(legend.position="top", axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
labs(x = "Countries", y = "Documents", fill = "Awardee affiliation country:") +
scale_x_discrete(labels = NULL) #used to remove vertical labels, also breaks = NULL# ## Plot just for countries matching BPindiv$affiliation_country
# #reduce productivity dataframe to match affiliation_country:
# COprod %>% filter(Country %in% unique(BPindiv$affiliation_country)) -> COprod_affil
#
# #recode top 10 countries + other
# COprod_affil$Country_affil <- recode(COprod_affil$Country,
# "USA" = "USA",
# "UK" = "UK",
# "Australia" = "Australia",
# "China" = "China",
# "Germany" = "Germany",
# "Canada" = "Canada",
# "Japan" = "Japan",
# "Netherlands" = "Netherlands",
# "Italy" = "Italy",
# "France" = "France",
# .default = "other")
# #table(COprod_affil$Country_affil)
# names(COprod_affil)
#
# #productivity of top 10 countries and other
# COprod_affil %>%
# ggplot(aes(x = 1, y = Documents, fill = Country_affil)) + #reorder(Country_affil, desc(Documents))
# geom_bar(stat = "identity", position = "fill") +
# coord_flip() +
# #geom_bar(aes(fill = Country_affil), stat = "identity", position = "fill") +
# scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
# theme_bw() +
# theme(legend.position="top", axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
# labs(x = "Countries", y = "Documents", fill = "Awardee affiliation country:") +
# scale_x_discrete(labels = NULL) #used to remove vertical labels, also breaks = NULLFigure SX.
# What are the most common first names of the past award winners?
#extract each winner's first name: first whitespace-delimited word of awardee_name
BPindiv$first_names <- word(BPindiv$awardee_name, 1) #extract first word
#length(unique(BPindiv$first_names)) #815 unique first names, note some are initials
#count number of unique first names overall (per-decade grouping left commented out):
BPindiv %>%
filter(!is.na(first_names)) %>%
#group_by(Decade) %>%
summarise(count = n_distinct(first_names))## # A tibble: 1 × 1
## count
## <int>
## 1 815
# #count number of unique first names per year:
# BPindiv %>%
# filter(!is.na(first_names)) %>%
# group_by(award_year) %>%
# summarise(count = n_distinct(first_names)) %>% View()
#count first_names - view the names appearing more than 5 times (16 names):
BPindiv %>%
#filter(!is.na(first_names)) %>%
count(first_names) %>%
arrange(desc(n)) %>%
filter(n > 5) ## # A tibble: 16 × 2
## first_names n
## <chr> <int>
## 1 Sarah 13
## 2 David 10
## 3 Christopher 8
## 4 Michael 8
## 5 Anne 7
## 6 Daniel 7
## 7 Jonathan 7
## 8 Richard 7
## 9 Adam 6
## 10 Andrew 6
## 11 Benjamin 6
## 12 Brian 6
## 13 Eric 6
## 14 James 6
## 15 Philipp 6
## 16 William 6
#plot first names that appear more than 5 times (horizontal bars, most frequent on top)
BPindiv %>%
#filter(!is.na(first_names)) %>%
count(first_names) %>%
arrange(desc(n)) %>%
filter(n > 5) %>%
ggplot(aes(x = reorder(first_names, n), y = n)) +
geom_bar(stat = "identity", position = position_dodge(0.9)) +
coord_flip() +
theme_bw() +
labs(x = "Winners first name", y = "Count")
# What are the most common affiliation institutions of the past award winners?
# (section heading restored as a comment - it had been fused onto the preceding
# code line during extraction, which broke the R syntax)
#length(unique(BPindiv$affiliation_institution)) #672 unique affiliation institutions, note some require cleaning from extra information after ,
#normalise institution names by stripping everything from the first comma onwards
#(e.g. trailing department/city information)
BPindiv$institution <- gsub(",.*", "", BPindiv$affiliation_institution) #remove everything after first comma
#length(unique(BPindiv$institution)) #581
# #count number of unique first names per decade:
# BPindiv %>%
# filter(!is.na(institution)) %>%
# group_by(Decade) %>%
# summarise(count = n_distinct(institution))
# #count number of unique first names per year:
# BPindiv %>%
# filter(!is.na(institution)) %>%
# group_by(award_year) %>%
# summarise(count = n_distinct(institution)) %>% View()
# #count institutions - view names with >5 counts (16 names):
# BPindiv %>%
# filter(!is.na(institution)) %>%
# count(institution) %>%
# arrange(desc(n)) %>%
# filter(n > 5)
#plot institutions that appear more than 5 times (horizontal bars, most frequent on top)
BPindiv %>%
filter(!is.na(institution)) %>%
count(institution) %>%
arrange(desc(n)) %>%
filter(n > 5) %>%
ggplot(aes(x = reorder(institution, n), y = n)) +
geom_bar(stat = "identity", position = position_dodge(0.9)) +
coord_flip() +
theme_bw() +
labs(x = "Winners affiliation institution", y = "Count")
# Figure SX.
# Award descriptions - is a cash award mentioned?
#table(BPdata$Award_cash) #103 no, 111 yes
#Figure 6A: single horizontal stacked bar of awards mentioning a cash component
#("not available" relabelled to "no description")
figure6A <- BPdata %>%
mutate(Award_cash = factor(Award_cash, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
mutate(Award_cash = recode(Award_cash, 'no' = 'no', 'yes' = 'yes', 'not available' = 'no description')) %>% #change level names
count(Award_cash) %>%
ggplot(aes(x = 1, y = n, fill = Award_cash)) +
geom_col(width = 1, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks = c(0,50,100, 150, 200, 250)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Cash", y = "Award count", fill = "Cash award mentioned:") +
scale_x_discrete(labels = NULL) + #used to remove vertical labels, also breaks = NULL
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Award descriptions - cash award mentioned across SA:
# (section heading restored as a comment - it had been fused onto the preceding
# code line during extraction, which broke the R syntax)
#table(BPdata$Scimago_Subject_Area, BPdata$Award_cash) #as a table by SA
#cash-mention counts broken down by Scimago Subject Area (displayed only)
BPdata %>%
mutate(Award_cash = factor(Award_cash, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
mutate(Award_cash = recode(Award_cash, 'no' = 'no', 'yes' = 'yes', 'not available' = 'no description')) %>% #change level names
count(Scimago_Subject_Area, Award_cash) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Award_cash)) +
geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
coord_flip() +
scale_y_continuous(breaks=c(0,5,10)) +
theme_classic() +
scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
labs(x = "Scimago Subject Area", y = "Award count", fill = "Cash award mentioned:") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Award descriptions - cash award amount?
# (section heading restored as a comment - it had been fused onto the preceding
# code line during extraction, which broke the R syntax)
#table(!is.na(BPdata$Award_cash_max_USD_pperson)) #97 values available
#table(!is.na(BPdata$Award_cash_max_USD_pperson), BPdata$Award_individual) #of 97 values available, 40 are for individual awards
# Figure 6B: horizontal histogram (1000-USD bins) of the maximum cash amount per
# person, for awards where an amount is available.
figure6B <- BPdata %>%
  filter(!is.na(Award_cash_max_USD_pperson)) %>%
  ggplot(aes(y = Award_cash_max_USD_pperson)) +
  geom_histogram(binwidth = 1000, fill = "grey60", col = "white") +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 30000, by = 5000)) +
  labs(y = "Award cash max. USD/person", x = "Award count") +
  theme_bw() +
  theme(legend.position = "none", axis.title.x = element_text(size = 10))
#Plot cash amount by Scimago Subject Area (beeswarm of per-person USD maxima)
BPdata %>%
filter(!is.na(Award_cash_max_USD_pperson)) %>%
ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = Award_cash_max_USD_pperson)) +
geom_beeswarm(color = "grey30", alpha = 0.5) +
coord_flip() +
theme_bw() +
labs(y = "Award cash max. USD/person", x = "Scimago Subject Area") +
theme(legend.position = "none", axis.title.x = element_text(size = 10))
# Comment_on_the_award_cash - wordcloud of frequencies of all words.
# (section heading restored as a comment - it had been fused onto the preceding
# code line during extraction, which broke the R syntax)
#Using packages tidytext and stopwords: tokenize the free-text cash comments,
#drop stop words, stem, and count word frequencies for the wordcloud below
Comment_on_the_award_cash_txt <- tibble(txt = tolower(BPdata$Comment_on_the_award_cash))
Comment_on_the_award_cash_txt <- Comment_on_the_award_cash_txt %>% unnest_tokens(output = word, input = txt, token = "words", to_lower = TRUE) #restructure all descriptions as one-token-per-row format
Comment_on_the_award_cash_txt <- Comment_on_the_award_cash_txt %>% anti_join(get_stopwords()) #remove stop words from library(stopwords) ## Joining with `by = join_by(word)`
Comment_on_the_award_cash_txt$word <- tokenize_word_stems(Comment_on_the_award_cash_txt$word) #stem each word; NOTE(review): tokenize_word_stems() returns a list, not a character vector - count() below appears to cope, but verify (the original comment, "make lowercase", did not match this call)
word.freq <- Comment_on_the_award_cash_txt %>% count(word, sort = TRUE) #count words
word.freq$word <- gsub("[[:digit:]]", "", word.freq$word) #remove numbers
word.freq$word <- gsub("[[:punct:][:blank:]]+", " ", word.freq$word) #collapse punctuation/whitespace to a single space
word.freq <- word.freq %>% drop_na() %>% filter(word != "") #drop empty entries left by the cleaning above
## plot wordcloud using library(ggwordcloud) - NOTE(review): the comment said "top 100 words" but top_n(50) keeps the 50 most frequent; confirm intended
set.seed(111) #fix word placement so the cloud is reproducible
figure6C <- word.freq %>%
top_n(50) %>%
ggplot() +
geom_text_wordcloud_area(aes(label = word, size = n), eccentricity = 0.2, shape = "square") +
#scale_size_area(max_size = 10) +
theme_minimal() +
scale_color_gradient(low = "#CCCCCC", high = "#1A1A1A") +
theme(plot.background = element_rect(fill = "#FFFFFF", colour = "#FFFFFF")) #preventing transparent background in png## Selecting by n
# figure6C <- word.freq %>% with(wordcloud(word, n, max.words = 100, random.order = FALSE, rot.per = 0, colors = brewer.pal(12, "Paired")))Combine 3 panels and save:
#assemble the panels using patchwork package: 6A full-width on top, 6B and 6C side by side below (height ratio 1:4)
figure6 <- figure6A / (figure6B + figure6C) +
plot_layout(nrow = 2, heights = c(1, 4)) +
plot_annotation(tag_levels = "A")
#ggsave(plot = figure6, here("plots", "Fig6ABC_perks_v0.png"), width = 18, height = 8, units = "cm", dpi = "retina", scale = 1.2)
#ggsave(plot = figure6, here("plots", "Fig6ABC_perks_v0.pdf"), width = 18, height = 8, units = "cm", scale = 1.2)Count mentions of specific words (stemmed) in Comment_on_the_award_cash:
#create list of specific words (stemmed) to count within strings
specific.words2 <- c("certificate", "grant", "attend", "expens", "plaque", "talk", "invit", "registr", "meet", "conference", "member", "travel", "subscr", "board", "editor", "ticket", "article", "free", "ceremon", "congress")
#prepare award descriptions as a single lowercase string
#NOTE(review): tolower() applied to a one-column data frame deparses it into one
#long string - that works for the str_count() total below but is fragile; confirm intended
descriptions2 <- BPdata %>%
filter(!is.na(Comment_on_the_award_cash)) %>%
select(Comment_on_the_award_cash) %>%
tolower() #single lowercase string
#sum of all mentions for each word (total occurrences across all descriptions)
specific.words.mentions2 <- specific.words2 %>%
map_int(~ str_count(tolower(descriptions2), .x))
#prepare award descriptions while keeping them separate for each award
descriptions3 <- tolower(BPdata$Comment_on_the_award_cash) #vector of lowercase strings
#sum of mentions per award for each word (counts at most one mention per award)
specific.words.mentions3 <- specific.words2 %>%
map_int(~ sum(str_detect(descriptions3, .x), na.rm = TRUE))
# ## doing the same as above, but manually:
# #count all mentions of words(parts) individually, e.g.:
# sum(str_count(tolower(BPdata$Comment_on_the_award_cash), "certificate"), na.rm = TRUE)
# #counting once per award, e.g.
# sum(str_detect(BPdata$Comment_on_the_award_cash, "certificate"), na.rm = TRUE)Plot frequencies of specific words - all mentions in cash descriptions.
#assemble the word-stem counts: Count_all = every occurrence,
#Count_once = number of awards mentioning the stem at least once
words.df <- tibble(Words = specific.words2,
Count_all = specific.words.mentions2,
Count_once = specific.words.mentions3)
#plot frequencies of specific words - all mentions in cash descriptions
words.df %>%
ggplot(aes(x = reorder(Words, Count_all), y = Count_all)) +
geom_col(width = 0.8, fill = "#838B83") +
coord_flip() +
scale_y_continuous(breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55), limits = c(0, 55)) +
theme_bw() +
labs(x = "Word stem", y = "Count of all mentions") +
theme(legend.position = "none", axis.title.x = element_text(size = 10))
# Plot frequencies of specific words - first mentions in cash descriptions.
# (section heading restored as a comment - it had been fused onto the preceding
# code line during extraction, which broke the R syntax)
#same data, but counting each stem at most once per award (Count_once);
#bars are still ordered by Count_all so both plots share the same ordering
words.df %>%
ggplot(aes(x = reorder(Words, Count_all), y = Count_once)) +
geom_col(width = 0.8, fill = "#C1CDC1") +
coord_flip() +
scale_y_continuous(breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55), limits = c(0, 55)) +
theme_bw() +
labs(x = "Word stem", y = "Count of first mention per award") +
#scale_x_discrete(labels = NULL) + labs(x = "") + #used to remove vertical labels, also breaks = NULL
theme(legend.position = "none", axis.title.x = element_text(size = 10))
# Individual winners data - presence of a personal profile?
# (section heading restored as a comment - it had been fused onto the preceding
# code line during extraction, which broke the R syntax)
#note: based on individual winners data
#table(BPindiv$awardee_profile_shown, useNA = "always")
#Figure 7A: per-year counts of winners with/without a personal profile shown
figure7A <- BPindiv %>%
mutate(awardee_profile_shown = factor(awardee_profile_shown, levels = (c("yes", "no")))) %>% #reorder value levels
group_by(award_year) %>%
count(awardee_profile_shown) %>%
ggplot(aes(x = award_year, y = n, fill = awardee_profile_shown)) +
geom_bar(aes(fill = awardee_profile_shown), stat = "identity") + # use , position = "fill" for proportion plot
scale_fill_manual(values = c("#C1FFC1", "#FA8072")) +
theme_bw() +
labs(x = "Year", y = "Winner count", fill = "Winner's profile shown") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Individual winners data - presence of a personal photo?
# (section heading restored as a comment - it had been fused onto the preceding
# code line during extraction, which broke the R syntax)
#table(BPindiv$awardee_photo_shown, useNA = "always")
#Figure 7B: per-year counts of winners with/without a personal photo shown
figure7B <- BPindiv %>%
mutate(awardee_photo_shown = factor(awardee_photo_shown, levels = (c("yes", "no")))) %>% #reorder value levels
group_by(award_year) %>%
count(awardee_photo_shown) %>%
ggplot(aes(x = award_year, y = n, fill = awardee_photo_shown)) +
geom_bar(aes(fill = awardee_photo_shown), stat = "identity") + # use , position = "fill" for proportion plot
scale_fill_manual(values = c("#C1FFC1", "#FA8072")) +
theme_bw() +
labs(x = "Year", y = "Winner count", fill = "Winner's photo shown") +
theme(legend.position = "top", axis.title.x = element_text(size = 10))
# Combine 2 panels and save:
# (section heading restored as a comment - it had been fused onto the preceding
# code line during extraction, which broke the R syntax)
#assemble the panels using patchwork package
#NOTE(review): panels are combined with `/` (vertical stacking) but plot_layout()
#then requests ncol = 2, nrow = 1 (side by side) - confirm which layout is intended
figure7 <- figure7A / figure7B +
plot_layout(ncol = 2, nrow = 1) +
plot_annotation(tag_levels = "A")
#ggsave(plot = figure7, here("plots", "Fig7AB_perks_v0.png"), width = 18, height = 6, units = "cm", dpi = "retina", scale = 1.2)
#ggsave(plot = figure7, here("plots", "Fig7AB_perks_v0.pdf"), width = 18, height = 6, units = "cm", scale = 1.2)Statistical analyses of the gender difference in awardee gender across years - using individual-level data
#Scale the award_year variable to have the mean of 0 and SD of 1
#(scale() returns a 1-column matrix; glmer accepts it as a predictor)
BPindiv$Award_year_scaled <- scale(BPindiv$award_year)
#table(BPindiv$awardee_gender, useNA = "always")
#code gender as binary: 1 = female, 0 = male; any other/NA value falls through to NA
#NOTE(review): endsWith() is used as an equality test - assumes awardee_gender is exactly "F"/"M"; confirm
BPindiv %>%
mutate(Gender = case_when(
endsWith(awardee_gender, "F") ~ 1,
endsWith(awardee_gender, "M") ~ 0
)) -> BPindiv
#table(BPindiv$Gender)
#Fit generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
#without year (intercept-only; random intercepts for Subject Area and award)
model_gender <- glmer(Gender ~ 1+ (1|award_SA) + (1|award_name), family = "binomial", data = BPindiv) #without year as a predictor
summary(model_gender)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Gender ~ 1 + (1 | award_SA) + (1 | award_name)
## Data: BPindiv
##
## AIC BIC logLik deviance df.resid
## 1429.7 1444.7 -711.9 1423.7 1080
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.4325 -0.7770 -0.6408 1.0866 2.0178
##
## Random effects:
## Groups Name Variance Std.Dev.
## award_name (Intercept) 0.2365 0.4863
## award_SA (Intercept) 0.1482 0.3849
## Number of obs: 1083, groups: award_name, 62; award_SA, 19
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5005 0.1430 -3.501 0.000464 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#back-transform the intercept from the logit to the probability scale:
plogis(summary(model_gender)$coef[1,1])*100 #expected % of female awardees at the intercept: ~38% female names## [1] 37.74133
#refit with scaled award year as a fixed-effect predictor
model_gender <- glmer(Gender ~ Award_year_scaled + (1|award_SA) + (1|award_name), family = "binomial", data = BPindiv) #with year as a predictor
summary(model_gender)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Gender ~ Award_year_scaled + (1 | award_SA) + (1 | award_name)
## Data: BPindiv
##
## AIC BIC logLik deviance df.resid
## 1430.7 1450.7 -711.4 1422.7 1079
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.4649 -0.7945 -0.6312 1.0715 2.1235
##
## Random effects:
## Groups Name Variance Std.Dev.
## award_name (Intercept) 0.2231 0.4724
## award_SA (Intercept) 0.1660 0.4074
## Number of obs: 1083, groups: award_name, 62; award_SA, 19
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.51269 0.14555 -3.522 0.000428 ***
## Award_year_scaled 0.07079 0.07194 0.984 0.325116
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Awrd_yr_scl -0.084
#back-transform the intercept from the logit to the probability scale:
plogis(summary(model_gender)$coef[1,1])*100 #expected % of female awardees at the intercept: ~37% female names## [1] 37.45634
#use the report package (Makowski et al. 2021) to summarize the analysis
report::report(model_gender)## We fitted a logistic mixed model (estimated using ML and Nelder-Mead optimizer)
## to predict Gender with Award_year_scaled (formula: Gender ~ Award_year_scaled).
## The model included award_SA as random effects (formula: list(~1 | award_SA, ~1
## | award_name)). The model's total explanatory power is weak (conditional R2 =
## 0.11) and the part related to the fixed effects alone (marginal R2) is of
## 1.36e-03. The model's intercept, corresponding to Award_year_scaled = 0, is at
## -0.51 (95% CI [-0.80, -0.23], p < .001). Within this model:
##
## - The effect of Award year scaled is statistically non-significant and positive
## (beta = 0.07, 95% CI [-0.07, 0.21], p = 0.325; Std. beta = 0.07, 95% CI [-0.07,
## 0.21])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
# Statistical analyses of the presence of awardee personal profiles across years - using individual-level data
#table(BPindiv$awardee_profile_shown, useNA = "always")
#code profile presence as binary: 1 = yes, 0 = no; other/NA values fall through to NA
#NOTE(review): endsWith() is used as an equality test - assumes values are exactly "yes"/"no"; confirm
BPindiv %>%
mutate(Profile = case_when(
endsWith(awardee_profile_shown, "yes") ~ 1,
endsWith(awardee_profile_shown, "no") ~ 0
)) -> BPindiv
#Fit generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package). Note: does not work with SA as a random effect:
model_profile <- glmer(Profile ~ Award_year_scaled + (1|award_name), family = "binomial", data = BPindiv) #with year as a predictor
summary(model_profile) #slope 1.11 ## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Profile ~ Award_year_scaled + (1 | award_name)
## Data: BPindiv
##
## AIC BIC logLik deviance df.resid
## 383.4 398.4 -188.7 377.4 1080
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7706 -0.0154 -0.0096 -0.0051 10.6139
##
## Random effects:
## Groups Name Variance Std.Dev.
## award_name (Intercept) 95.71 9.783
## Number of obs: 1083, groups: award_name, 62
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.1033 1.4117 -6.448 1.13e-10 ***
## Award_year_scaled 1.1135 0.2227 5.000 5.75e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Awrd_yr_scl -0.156
#use the reports package (Makowski et al. 2021) to summarize the analysis
report::report(model_profile)## We fitted a logistic mixed model (estimated using ML and Nelder-Mead optimizer)
## to predict Profile with Award_year_scaled (formula: Profile ~
## Award_year_scaled). The model included award_name as random effect (formula: ~1
## | award_name). The model's total explanatory power is substantial (conditional
## R2 = 0.97) and the part related to the fixed effects alone (marginal R2) is of
## 0.01. The model's intercept, corresponding to Award_year_scaled = 0, is at
## -9.10 (95% CI [-11.87, -6.34], p < .001). Within this model:
##
## - The effect of Award year scaled is statistically significant and positive
## (beta = 1.11, 95% CI [0.68, 1.55], p < .001; Std. beta = 1.11, 95% CI [0.68,
## 1.55])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
# Statistical analyses of the presence of awardee photos across years - using individual-level data
#table(BPindiv$awardee_photo_shown, useNA = "always")
#code photo presence as binary: 1 = yes, 0 = no; other/NA values fall through to NA
#NOTE(review): endsWith() is used as an equality test - assumes values are exactly "yes"/"no"; confirm
BPindiv %>%
mutate(Photo = case_when(
endsWith(awardee_photo_shown, "yes") ~ 1,
endsWith(awardee_photo_shown, "no") ~ 0
)) -> BPindiv
#Fit generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package). Note: does not work with SA as a random effect:
model_photo <- glmer(Photo ~ Award_year_scaled + (1|award_name), family = "binomial", data = BPindiv) #with year as a predictor
summary(model_photo) #slope 1.18## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Photo ~ Award_year_scaled + (1 | award_name)
## Data: BPindiv
##
## AIC BIC logLik deviance df.resid
## 381.3 396.3 -187.6 375.3 1080
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.5501 -0.0102 -0.0027 0.0353 2.1078
##
## Random effects:
## Groups Name Variance Std.Dev.
## award_name (Intercept) 194.9 13.96
## Number of obs: 1083, groups: award_name, 62
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.6806 1.3957 -6.936 4.03e-12 ***
## Award_year_scaled 1.1822 0.2378 4.972 6.63e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Awrd_yr_scl -0.173
#use the reports package (Makowski et al. 2021) to summarize the analysis
report::report(model_photo)## We fitted a logistic mixed model (estimated using ML and Nelder-Mead optimizer)
## to predict Photo with Award_year_scaled (formula: Photo ~ Award_year_scaled).
## The model included award_name as random effect (formula: ~1 | award_name). The
## model's total explanatory power is substantial (conditional R2 = 0.98) and the
## part related to the fixed effects alone (marginal R2) is of 7.00e-03. The
## model's intercept, corresponding to Award_year_scaled = 0, is at -9.68 (95% CI
## [-12.42, -6.95], p < .001). Within this model:
##
## - The effect of Award year scaled is statistically significant and positive
## (beta = 1.18, 95% CI [0.72, 1.65], p < .001; Std. beta = 1.18, 95% CI [0.72,
## 1.65])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
# NOTE: Using the year of the oldest listed past winner as a proxy of the award establishment year.
#Scale the Awardee_list_earliest_year variable to have the mean of 0 and SD of 1
#(proxy for award age, used as the predictor in the logistic regressions below)
BPdata$Awardee_list_earliest_year_scaled <- scale(BPdata$Awardee_list_earliest_year)
#table(!is.na(BPdata$Awardee_list_earliest_year)) # values available
## try logistic regressions with Awardee_list_earliest_year for the following variables:
#Award_individual, Flexible_eligibility, Inclusivity_statement, Assessors_transparency, Award_integrity_mentioned, Process_transparency, Self_nomination, Letter_required, Feedback_availability, Award_contact_provided, Criteria_transparency, Award_impact_metrics_mentioned, Award_impact_metrics_only, Open_science
## Award_individual
#table of raw response values
table(BPdata$Award_individual, useNA = "always")##
## no yes <NA>
## 156 66 0
#filter to rows with a known earliest-awardee year and make the response binomial (yes = 1, no = 0)
BPdata %>%
filter(!is.na(Awardee_list_earliest_year)) %>%
mutate(Award_individual_num = recode(Award_individual, yes = 1, no = 0)) -> BPdata4
#plot: scatter with a fitted logistic regression curve
BPdata4 %>%
ggplot(aes(x = Awardee_list_earliest_year, y = Award_individual_num)) +
geom_point(alpha = 0.2) +
stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
xlab("Year of first listed awardee") +
ylab("Probability of Award_individual") +
theme_minimal()## `geom_smooth()` using formula = 'y ~ x'
#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Award_individual <- glmer(Award_individual_num ~ Awardee_list_earliest_year_scaled + (1|Scimago_Subject_Area) + (1|Awarding_society), family = "binomial", data = BPdata4) #with year as a predictor
summary(model_Award_individual) #slope ns## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Award_individual_num ~ Awardee_list_earliest_year_scaled + (1 |
## Scimago_Subject_Area) + (1 | Awarding_society)
## Data: BPdata4
##
## AIC BIC logLik deviance df.resid
## 229.9 243.1 -110.9 221.9 199
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.3911 -0.5661 -0.2933 0.6713 2.0516
##
## Random effects:
## Groups Name Variance Std.Dev.
## Awarding_society (Intercept) 0.2334 0.4831
## Scimago_Subject_Area (Intercept) 2.3012 1.5170
## Number of obs: 203, groups: Awarding_society, 118; Scimago_Subject_Area, 27
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0512 0.3854 -2.727 0.00639 **
## Awardee_list_earliest_year_scaled 0.2523 0.2318 1.088 0.27650
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Awrd_lst___ 0.040
#report::report(model_Award_individual) #use the reports package (Makowski et al. 2021) to summarize the analysis
## Flexible_eligibility
#table
table(BPdata$Flexible_eligibility, useNA = "always")##
## no not applicable not available yes <NA>
## 21 185 11 5 0
#filter to yes/no responses with a known earliest-awardee year and make the response binomial (yes = 1, no = 0)
BPdata %>%
filter(!is.na(Awardee_list_earliest_year)) %>%
filter(Flexible_eligibility == "yes" | Flexible_eligibility == "no") %>%
mutate(Flexible_eligibility_num = recode(Flexible_eligibility, yes = 1, no = 0)) -> BPdata4
#plot: scatter with a fitted logistic regression curve
BPdata4 %>%
ggplot(aes(x = Awardee_list_earliest_year, y = Flexible_eligibility_num)) +
geom_point(alpha = 0.2) +
stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
xlab("Year of first listed awardee") +
ylab("Probability of Flexible_eligibility") +
theme_minimal()## `geom_smooth()` using formula = 'y ~ x'
#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
#model_Flexible_eligibility <- glmer(Flexible_eligibility_num ~ Awardee_list_earliest_year_scaled + (1|Scimago_Subject_Area) + (1|Awarding_society), family = "binomial", data = BPdata4) #with year as a predictor - NOT CONVERGING with random effects included
model_Flexible_eligibility <- glm(Flexible_eligibility_num ~ Awardee_list_earliest_year_scaled, family = "binomial", data = BPdata4) #with year as a predictor, without random effects included
summary(model_Flexible_eligibility) #slope ns##
## Call:
## glm(formula = Flexible_eligibility_num ~ Awardee_list_earliest_year_scaled,
## family = "binomial", data = BPdata4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7852 -0.6813 -0.6704 -0.6629 1.7989
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.32990 0.50324 -2.643 0.00823 **
## Awardee_list_earliest_year_scaled -0.08686 0.48362 -0.180 0.85747
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24.564 on 23 degrees of freedom
## Residual deviance: 24.532 on 22 degrees of freedom
## AIC: 28.532
##
## Number of Fisher Scoring iterations: 4
#report::report(model_Flexible_eligibility) #use the reports package (Makowski et al. 2021) to summarize the analysis
## Assessors_transparency
#table
table(BPdata$Assessors_transparency, useNA = "always")##
## no not available yes <NA>
## 98 10 114 0
#filter to yes/no responses with a known earliest-awardee year and make the response binomial (yes = 1, no = 0)
BPdata %>%
filter(!is.na(Awardee_list_earliest_year)) %>%
filter(Assessors_transparency == "yes" | Assessors_transparency == "no") %>%
mutate(Assessors_transparency_num = recode(Assessors_transparency, yes = 1, no = 0)) -> BPdata4
#plot: scatter with a fitted logistic regression curve
BPdata4 %>%
ggplot(aes(x = Awardee_list_earliest_year, y = Assessors_transparency_num)) +
geom_point(alpha = 0.2) +
stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
xlab("Year of first listed awardee") +
ylab("Probability of Assessors_transparency") +
theme_minimal()## `geom_smooth()` using formula = 'y ~ x'
#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Assessors_transparency <- glmer(Assessors_transparency_num ~ Awardee_list_earliest_year_scaled + (1|Scimago_Subject_Area), family = "binomial", data = BPdata4) #with year as a predictor, + (1|Awarding_society) causes "boundary (singular)"
summary(model_Assessors_transparency) #slope ns## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Assessors_transparency_num ~ Awardee_list_earliest_year_scaled +
## (1 | Scimago_Subject_Area)
## Data: BPdata4
##
## AIC BIC logLik deviance df.resid
## 272.5 282.3 -133.3 266.5 191
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.4059 -0.9351 0.7128 0.9220 1.2300
##
## Random effects:
## Groups Name Variance Std.Dev.
## Scimago_Subject_Area (Intercept) 0.2852 0.534
## Number of obs: 194, groups: Scimago_Subject_Area, 27
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.06852 0.18168 0.377 0.706
## Awardee_list_earliest_year_scaled 0.02803 0.15627 0.179 0.858
##
## Correlation of Fixed Effects:
## (Intr)
## Awrd_lst___ 0.012
#report::report(model_Assessors_transparency) #use the reports package (Makowski et al. 2021) to summarize the analysis
## Award_integrity_mentioned - more likely in older awards!
# Tabulate raw responses, including missing values:
table(BPdata$Award_integrity_mentioned, useNA = "always")
##
## no not available yes <NA>
## 192 11 19 0
#filter and make binomial: keep rows with a known first-awardee year and a
#definite yes/no answer, then encode the answer as 0/1 for binomial modelling
BPdata %>%
filter(!is.na(Awardee_list_earliest_year)) %>%
filter(Award_integrity_mentioned == "yes" | Award_integrity_mentioned == "no") %>%
mutate(Award_integrity_mentioned_num = recode(Award_integrity_mentioned, yes = 1, no = 0)) -> BPdata4
#plot the binary outcome against year with a logistic smoother
BPdata4 %>%
ggplot(aes(x = Awardee_list_earliest_year, y = Award_integrity_mentioned_num)) +
geom_point(alpha = 0.2) +
stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
xlab("Year of first listed awardee") +
ylab("Probability of Award_integrity_mentioned") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#model - generalised mixed model, binomial errors, logit link, awarding society
#as a random intercept (glmer from the lme4 package):
model_Award_integrity_mentioned <- glmer(Award_integrity_mentioned_num ~ Awardee_list_earliest_year_scaled + (1|Awarding_society), family = "binomial", data = BPdata4) #with year as a predictor, + (1|Scimago_Subject_Area) causes "boundary (singular)"
summary(model_Award_integrity_mentioned) #slope ns (p = 0.141), but close
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Award_integrity_mentioned_num ~ Awardee_list_earliest_year_scaled +
## (1 | Awarding_society)
## Data: BPdata4
##
## AIC BIC logLik deviance df.resid
## 85.6 95.4 -39.8 79.6 190
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.1129 -0.0979 -0.0037 -0.0023 4.6810
##
## Random effects:
## Groups Name Variance Std.Dev.
## Awarding_society (Intercept) 559.1 23.64
## Number of obs: 193, groups: Awarding_society, 115
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.6213 2.6581 -4.372 1.23e-05 ***
## Awardee_list_earliest_year_scaled -1.1068 0.7514 -1.473 0.141
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Awrd_lst___ 0.657
# NOTE(review): the Awarding_society variance above is very large (559.1),
# which suggests a near-degenerate fit - interpret these estimates with caution.
#report::report(model_Award_integrity_mentioned) #use the reports package (Makowski et al. 2021) to summarize the analysis
## Self_nomination - trend for newer awards
# Tabulate raw responses, including missing values:
table(BPdata$Self_nomination, useNA = "always")
##
## no not available yes <NA>
## 183 11 28 0
# Filter to rows with a known first-awardee year and a definite yes/no answer,
# then encode the answer as binary (yes = 1, no = 0) for binomial modelling.
# (Left assignment and %in% replace the original -> and duplicated ==-tests.)
BPdata4 <- BPdata %>%
  filter(!is.na(Awardee_list_earliest_year)) %>%
  filter(Self_nomination %in% c("yes", "no")) %>%
  mutate(Self_nomination_num = recode(Self_nomination, yes = 1, no = 0))
# Plot the binary outcome against year with a logistic smoother:
BPdata4 %>%
  ggplot(aes(x = Awardee_list_earliest_year, y = Self_nomination_num)) +
  geom_point(alpha = 0.2) +
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") +
  ylab("Probability of Self_nomination") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Model - generalised linear mixed model with binomial errors and logit link,
# Scimago subject area as a random intercept (glmer from the lme4 package):
model_Self_nomination <- glmer(Self_nomination_num ~ Awardee_list_earliest_year_scaled + (1|Scimago_Subject_Area), family = "binomial", data = BPdata4) #with year as a predictor
summary(model_Self_nomination) #slope ns (p = 0.079), but trend for newer awards to mention self-nominations
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Self_nomination_num ~ Awardee_list_earliest_year_scaled + (1 |
## Scimago_Subject_Area)
## Data: BPdata4
##
## AIC BIC logLik deviance df.resid
## 154.0 163.8 -74.0 148.0 190
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.6315 -0.4125 -0.3501 -0.2592 4.8781
##
## Random effects:
## Groups Name Variance Std.Dev.
## Scimago_Subject_Area (Intercept) 0.3627 0.6023
## Number of obs: 193, groups: Scimago_Subject_Area, 27
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0653 0.3095 -6.673 2.51e-11 ***
## Awardee_list_earliest_year_scaled 0.5658 0.3225 1.754 0.0793 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Awrd_lst___ -0.343
#report::report(model_Self_nomination) #use the reports package (Makowski et al. 2021) to summarize the analysis
## Letter_required - less and less recently
# Tabulate raw responses, including missing values:
table(BPdata$Letter_required, useNA = "always")
##
## no not available yes <NA>
## 173 11 38 0
#filter and make binomial: keep rows with a known first-awardee year and a
#definite yes/no answer, then encode the answer as 0/1 for binomial modelling
BPdata %>%
filter(!is.na(Awardee_list_earliest_year)) %>%
filter(Letter_required == "yes" | Letter_required == "no") %>%
mutate(Letter_required_num = recode(Letter_required, yes = 1, no = 0)) -> BPdata4
#plot the binary outcome against year with a logistic smoother
BPdata4 %>%
ggplot(aes(x = Awardee_list_earliest_year, y = Letter_required_num)) +
geom_point(alpha = 0.2) +
stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
xlab("Year of first listed awardee") +
ylab("Probability of Letter_required") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#model - generalised mixed model, binomial errors, logit link, with random
#intercepts for both subject area and awarding society (glmer, lme4 package):
model_Letter_required <- glmer(Letter_required_num ~ Awardee_list_earliest_year_scaled + (1|Scimago_Subject_Area) + (1|Awarding_society), family = "binomial", data = BPdata4) #with year as a predictor
summary(model_Letter_required) #slope significant negative (-0.5425) - letters less commonly required over the years
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Letter_required_num ~ Awardee_list_earliest_year_scaled + (1 |
## Scimago_Subject_Area) + (1 | Awarding_society)
## Data: BPdata4
##
## AIC BIC logLik deviance df.resid
## 176.8 189.9 -84.4 168.8 189
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6478 -0.4311 -0.3374 -0.2687 2.6569
##
## Random effects:
## Groups Name Variance Std.Dev.
## Awarding_society (Intercept) 0.1654 0.4067
## Scimago_Subject_Area (Intercept) 0.5556 0.7454
## Number of obs: 193, groups: Awarding_society, 115; Scimago_Subject_Area, 27
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7578 0.3110 -5.653 1.58e-08 ***
## Awardee_list_earliest_year_scaled -0.5425 0.1929 -2.812 0.00492 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Awrd_lst___ 0.208
#report::report(model_Letter_required) #use the reports package (Makowski et al. 2021) to summarize the analysis
## Award_contact_provided - is a contact point for the award provided?
# Tabulate raw responses, including missing values:
table(BPdata$Award_contact_provided, useNA = "always")
##
## no not available yes <NA>
## 173 11 38 0
# Filter to rows with a known first-awardee year and a definite yes/no answer,
# then encode the answer as binary (yes = 1, no = 0) for binomial modelling.
# (Left assignment and %in% replace the original -> and duplicated ==-tests.)
BPdata4 <- BPdata %>%
  filter(!is.na(Awardee_list_earliest_year)) %>%
  filter(Award_contact_provided %in% c("yes", "no")) %>%
  mutate(Award_contact_provided_num = recode(Award_contact_provided, yes = 1, no = 0))
# Plot the binary outcome against year with a logistic smoother.
# BUGFIX: y-axis label previously said "Award_individual" (copy-paste error).
BPdata4 %>%
  ggplot(aes(x = Awardee_list_earliest_year, y = Award_contact_provided_num)) +
  geom_point(alpha = 0.2) +
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") +
  ylab("Probability of Award_contact_provided") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Model - plain logistic regression (glm); the mixed-model versions with
# random effects fail to fit, so random effects are omitted here.
model_Award_contact_provided <- glm(Award_contact_provided_num ~ Awardee_list_earliest_year_scaled, family = "binomial", data = BPdata4) #with year as a predictor - but fails with random effects
summary(model_Award_contact_provided) #slope signif negative (-0.3283) - contact less likely to be provided recently
##
## Call:
## glm(formula = Award_contact_provided_num ~ Awardee_list_earliest_year_scaled,
## family = "binomial", data = BPdata4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0404 -0.6292 -0.5738 -0.5426 1.9943
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.5473 0.1923 -8.048 8.41e-16 ***
## Awardee_list_earliest_year_scaled -0.3283 0.1625 -2.021 0.0433 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182.74 on 192 degrees of freedom
## Residual deviance: 178.87 on 191 degrees of freedom
## AIC: 182.87
##
## Number of Fisher Scoring iterations: 4
#report::report(model_Award_contact_provided) #use the reports package (Makowski et al. 2021) to summarize the analysis
## Criteria_transparency - are the award criteria publicly stated?
# Raw response counts (NAs shown explicitly):
table(BPdata$Criteria_transparency, useNA = "always")
##
## no not available yes <NA>
## 170 12 40 0
# Restrict to definite yes/no answers with a known first-awardee year and
# add a 0/1 version of the response for binomial modelling.
BPdata4 <- BPdata %>%
  filter(
    !is.na(Awardee_list_earliest_year),
    Criteria_transparency %in% c("yes", "no")
  ) %>%
  mutate(Criteria_transparency_num = recode(Criteria_transparency, yes = 1, no = 0))
# Visualise the binary outcome against year with a logistic smoother:
ggplot(BPdata4, aes(x = Awardee_list_earliest_year, y = Criteria_transparency_num)) +
  geom_point(alpha = 0.2) +
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") +
  ylab("Probability of Criteria_transparency") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Simple logistic regression (no random effects) of the outcome on scaled year:
model_Criteria_transparency <- glm(Criteria_transparency_num ~ Awardee_list_earliest_year_scaled, family = "binomial", data = BPdata4)
summary(model_Criteria_transparency) #slope ns (p = 0.093)
##
## Call:
## glm(formula = Criteria_transparency_num ~ Awardee_list_earliest_year_scaled,
## family = "binomial", data = BPdata4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7817 -0.7168 -0.6488 -0.4688 2.2484
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4415 0.1891 -7.623 2.48e-14 ***
## Awardee_list_earliest_year_scaled 0.4048 0.2412 1.678 0.0934 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 191.04 on 191 degrees of freedom
## Residual deviance: 187.57 on 190 degrees of freedom
## AIC: 191.57
##
## Number of Fisher Scoring iterations: 4
#report::report(model_Criteria_transparency) #use the reports package (Makowski et al. 2021) to summarize the analysis
## Award_impact_metrics_mentioned - mostly last 10 years
# Tabulate raw responses, including missing values:
table(BPdata$Award_impact_metrics_mentioned, useNA = "always")
##
## no not available yes <NA>
## 190 11 21 0
#filter and make binomial: keep rows with a known first-awardee year and a
#definite yes/no answer, then encode the answer as 0/1 for binomial modelling
BPdata %>%
filter(!is.na(Awardee_list_earliest_year)) %>%
filter(Award_impact_metrics_mentioned == "yes" | Award_impact_metrics_mentioned == "no") %>%
mutate(Award_impact_metrics_mentioned_num = recode(Award_impact_metrics_mentioned, yes = 1, no = 0)) -> BPdata4
#plot the binary outcome against year with a logistic smoother
BPdata4 %>%
ggplot(aes(x = Awardee_list_earliest_year, y = Award_impact_metrics_mentioned_num)) +
geom_point(alpha = 0.2) +
stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
xlab("Year of first listed awardee") +
ylab("Probability of Award_impact_metrics_mentioned") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#model - generalised mixed model, binomial errors, logit link, awarding society
#as a random intercept (glmer from the lme4 package):
model_Award_impact_metrics_mentioned <- glmer(Award_impact_metrics_mentioned_num ~ Awardee_list_earliest_year_scaled + (1|Awarding_society), family = "binomial", data = BPdata4) #with year as a predictor
summary(model_Award_impact_metrics_mentioned) #slope signif positive (4.870) - metrics mentioned more often recently
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## Award_impact_metrics_mentioned_num ~ Awardee_list_earliest_year_scaled +
## (1 | Awarding_society)
## Data: BPdata4
##
## AIC BIC logLik deviance df.resid
## 90.1 99.9 -42.1 84.1 190
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.99218 -0.01514 -0.00158 -0.00009 2.46961
##
## Random effects:
## Groups Name Variance Std.Dev.
## Awarding_society (Intercept) 916.8 30.28
## Number of obs: 193, groups: Awarding_society, 115
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.264 3.262 -4.373 1.23e-05 ***
## Awardee_list_earliest_year_scaled 4.870 2.256 2.158 0.0309 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Awrd_lst___ -0.570
# NOTE(review): the Awarding_society variance above is very large (916.8),
# which suggests a near-degenerate fit - interpret these estimates with caution.
#report::report(model_Award_impact_metrics_mentioned) #use the reports package (Makowski et al. 2021) to summarize the analysis
## Award_impact_metrics_only - seem to be from the last 5 years!!
# Raw response counts (NAs shown explicitly):
table(BPdata$Award_impact_metrics_only, useNA = "always")
##
## no not available yes <NA>
## 203 11 8 0
# Restrict to definite yes/no answers with a known first-awardee year and
# add a 0/1 version of the response for binomial modelling.
BPdata4 <- BPdata %>%
  filter(
    !is.na(Awardee_list_earliest_year),
    Award_impact_metrics_only %in% c("yes", "no")
  ) %>%
  mutate(Award_impact_metrics_only_num = recode(Award_impact_metrics_only, yes = 1, no = 0))
# Visualise the binary outcome against year with a logistic smoother:
ggplot(BPdata4, aes(x = Awardee_list_earliest_year, y = Award_impact_metrics_only_num)) +
  geom_point(alpha = 0.2) +
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") +
  ylab("Probability of Award_impact_metrics_only") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Simple logistic regression of the outcome on scaled year; the mixed-model
# versions fail to fit, so random effects are omitted here.
model_Award_impact_metrics_only <- glm(Award_impact_metrics_only_num ~ Awardee_list_earliest_year_scaled, family = "binomial", data = BPdata4)
summary(model_Award_impact_metrics_only) #slope signif positive (3.784) - metrics-only awards more common recently
##
## Call:
## glm(formula = Award_impact_metrics_only_num ~ Awardee_list_earliest_year_scaled,
## family = "binomial", data = BPdata4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.69414 -0.36191 -0.16275 -0.03612 2.44021
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.154 1.289 -3.998 6.4e-05 ***
## Awardee_list_earliest_year_scaled 3.784 1.705 2.219 0.0265 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 66.596 on 192 degrees of freedom
## Residual deviance: 54.872 on 191 degrees of freedom
## AIC: 58.872
##
## Number of Fisher Scoring iterations: 8
#report::report(model_Award_impact_metrics_only) #use the reports package (Makowski et al. 2021) to summarize the analysis
## Open_science - does the award mention open-science practices?
# Tabulate raw responses, including missing values (only 1 "yes" overall):
table(BPdata$Open_science, useNA = "always")
##
## no not available yes <NA>
## 210 11 1 0
#filter and make binomial: keep rows with a known first-awardee year and a
#definite yes/no answer, then encode the answer as 0/1 for binomial modelling
BPdata %>%
filter(!is.na(Awardee_list_earliest_year)) %>%
filter(Open_science == "yes" | Open_science == "no") %>%
mutate(Open_science_num = recode(Open_science, yes = 1, no = 0)) -> BPdata4
#plot the binary outcome against year with a logistic smoother
BPdata4 %>%
ggplot(aes(x = Awardee_list_earliest_year, y = Open_science_num)) +
geom_point(alpha = 0.2) +
stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
xlab("Year of first listed awardee") +
ylab("Probability of Open_science") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#model - plain logistic regression (glm); fitting fails with random effects,
#so they are omitted. NOTE(review): with a single "yes" response the model is
#essentially uninformative - treat this fit as exploratory only.
model_Open_science <- glm(Open_science_num ~ Awardee_list_earliest_year_scaled, family = "binomial", data = BPdata4) #with year as a predictor, fails with random effects
summary(model_Open_science) #slope ns
##
## Call:
## glm(formula = Open_science_num ~ Awardee_list_earliest_year_scaled,
## family = "binomial", data = BPdata4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.53186 -0.08916 -0.07063 -0.05999 2.98784
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.7776 1.3438 -4.299 1.71e-05 ***
## Awardee_list_earliest_year_scaled -0.7611 0.5281 -1.441 0.15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12.520 on 192 degrees of freedom
## Residual deviance: 10.936 on 191 degrees of freedom
## AIC: 14.936
##
## Number of Fisher Scoring iterations: 8
#report::report(model_Open_science) #use the reports package (Makowski et al. 2021) to summarize the analysis
# Table of logistic regression results? Try logistic regressions against
# Awardee_list_earliest_year for the following variables:
# Award_individual, Flexible_eligibility, Inclusivity_statement, Assessors_transparency, Award_integrity_mentioned, Process_transparency, Self_nomination, Letter_required, Feedback_availability, Award_contact_provided, Criteria_transparency, Award_impact_metrics_mentioned, Award_impact_metrics_only, Open_science
# Exploratory probing of the {report} package output objects, to decide how to
# build a combined results table for all logistic regressions above.
report::report_effectsize(model_Award_impact_metrics_mentioned)
report::report_effectsize(model_Open_science)
# Inspect the different report representations of one model:
r <- report::report(model_Open_science)
r <- report::report_table(model_Open_science)
r <- report::report_model(model_Open_science)
r
str(r)
summary(r)
names(as.data.frame(r))
# model_Award_individual is defined earlier in the file
str(model_Award_individual)
report::report(model_Award_individual)$Effects
# TRY tables based on: https://cran.r-project.org/web/packages/sjPlot/vignettes/tab_mixed.html
library(sjPlot)
tab_model(model_Award_individual)
## | Award_individual_num | | | |
## |---|---|---|---|
## | Predictors | Odds Ratios | CI | p |
## | (Intercept) | 0.35 | 0.16 - 0.74 | 0.006 |
## | Awardee list earliest year scaled | 1.29 | 0.82 - 2.03 | 0.276 |
## | Random Effects | | | |
## | sigma^2 | 3.29 | | |
## | tau00 Awarding_society | 0.23 | | |
## | tau00 Scimago_Subject_Area | 2.30 | | |
## | ICC | 0.44 | | |
## | N Scimago_Subject_Area | 27 | | |
## | N Awarding_society | 115 | | |
## | Observations | 203 | | |
## | Marginal R2 / Conditional R2 | 0.011 / 0.441 | | |